drivers/md/dm-cache-target.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2012 Red Hat. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm.h"
   9 #include "dm-bio-prison-v2.h"
  10 #include "dm-bio-record.h"
  11 #include "dm-cache-metadata.h"
  12 #include "dm-io-tracker.h"
  13
  14 #include <linux/dm-io.h>
  15 #include <linux/dm-kcopyd.h>
  16 #include <linux/jiffies.h>
  17 #include <linux/init.h>
  18 #include <linux/mempool.h>
  19 #include <linux/module.h>
  20 #include <linux/rwsem.h>
  21 #include <linux/slab.h>
  22 #include <linux/vmalloc.h>
  23
/* NOTE(review): presumably the prefix picked up by the DMERR/DMINFO logging macros used in this file — confirm. */
#define DM_MSG_PREFIX "cache"

/*
 * Declares 'cache_copy_throttle' together with its module parameter; the
 * string is the parameter description.
 */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
        "A percentage of time allocated for copying to and/or from cache");
  28
  29 /*----------------------------------------------------------------*/
  30
  31 /*
  32  * Glossary:
  33  *
  34  * oblock: index of an origin block
  35  * cblock: index of a cache block
  36  * promotion: movement of a block from origin to cache
  37  * demotion: movement of a block from cache to origin
  38  * migration: movement of a block between the origin and cache device,
  39  *            either direction
  40  */
  41
  42 /*----------------------------------------------------------------*/
  43
/*
 * Represents a chunk of future work.  'input' allows continuations to pass
 * values between themselves, typically error values.
 */
struct continuation {
        /* Work item; queued by queue_continuation() or by the batcher's __commit(). */
        struct work_struct ws;
        /* Cleared by init_continuation(); __commit() stores the commit status here. */
        blk_status_t input;
};
  52
  53 static inline void init_continuation(struct continuation *k,
  54                                      void (*fn)(struct work_struct *))
  55 {
  56         INIT_WORK(&k->ws, fn);
  57         k->input = 0;
  58 }
  59
  60 static inline void queue_continuation(struct workqueue_struct *wq,
  61                                       struct continuation *k)
  62 {
  63         queue_work(wq, &k->ws);
  64 }
  65
  66 /*----------------------------------------------------------------*/
  67
/*
 * The batcher collects together pieces of work that need a particular
 * operation to occur before they can proceed (typically a commit).
 */
struct batcher {
        /*
         * The operation that everyone is waiting for.
         */
        blk_status_t (*commit_op)(void *context);
        void *commit_context;

        /*
         * This is how bios should be issued once the commit op is complete
         * (accounted_request).
         */
        void (*issue_op)(struct bio *bio, void *context);
        void *issue_context;

        /*
         * Queued work gets put on here after commit.
         */
        struct workqueue_struct *wq;

        /* Protects work_items, bios and commit_scheduled. */
        spinlock_t lock;
        /* Continuations parked by continue_after_commit(). */
        struct list_head work_items;
        /* Bios parked by issue_after_commit(). */
        struct bio_list bios;
        /* Runs __commit(); queued on wq by async_commit(). */
        struct work_struct commit_work;

        /*
         * Set by schedule_commit(), cleared by __commit().  While set,
         * newly parked work queues commit_work straight away.
         */
        bool commit_scheduled;
};
  98
  99 static void __commit(struct work_struct *_ws)
 100 {
 101         struct batcher *b = container_of(_ws, struct batcher, commit_work);
 102         blk_status_t r;
 103         struct list_head work_items;
 104         struct work_struct *ws, *tmp;
 105         struct continuation *k;
 106         struct bio *bio;
 107         struct bio_list bios;
 108
 109         INIT_LIST_HEAD(&work_items);
 110         bio_list_init(&bios);
 111
 112         /*
 113          * We have to grab these before the commit_op to avoid a race
 114          * condition.
 115          */
 116         spin_lock_irq(&b->lock);
 117         list_splice_init(&b->work_items, &work_items);
 118         bio_list_merge_init(&bios, &b->bios);
 119         b->commit_scheduled = false;
 120         spin_unlock_irq(&b->lock);
 121
 122         r = b->commit_op(b->commit_context);
 123
 124         list_for_each_entry_safe(ws, tmp, &work_items, entry) {
 125                 k = container_of(ws, struct continuation, ws);
 126                 k->input = r;
 127                 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
 128                 queue_work(b->wq, ws);
 129         }
 130
 131         while ((bio = bio_list_pop(&bios))) {
 132                 if (r) {
 133                         bio->bi_status = r;
 134                         bio_endio(bio);
 135                 } else
 136                         b->issue_op(bio, b->issue_context);
 137         }
 138 }
 139
 140 static void batcher_init(struct batcher *b,
 141                          blk_status_t (*commit_op)(void *),
 142                          void *commit_context,
 143                          void (*issue_op)(struct bio *bio, void *),
 144                          void *issue_context,
 145                          struct workqueue_struct *wq)
 146 {
 147         b->commit_op = commit_op;
 148         b->commit_context = commit_context;
 149         b->issue_op = issue_op;
 150         b->issue_context = issue_context;
 151         b->wq = wq;
 152
 153         spin_lock_init(&b->lock);
 154         INIT_LIST_HEAD(&b->work_items);
 155         bio_list_init(&b->bios);
 156         INIT_WORK(&b->commit_work, __commit);
 157         b->commit_scheduled = false;
 158 }
 159
 160 static void async_commit(struct batcher *b)
 161 {
 162         queue_work(b->wq, &b->commit_work);
 163 }
 164
 165 static void continue_after_commit(struct batcher *b, struct continuation *k)
 166 {
 167         bool commit_scheduled;
 168
 169         spin_lock_irq(&b->lock);
 170         commit_scheduled = b->commit_scheduled;
 171         list_add_tail(&k->ws.entry, &b->work_items);
 172         spin_unlock_irq(&b->lock);
 173
 174         if (commit_scheduled)
 175                 async_commit(b);
 176 }
 177
 178 /*
 179  * Bios are errored if commit failed.
 180  */
 181 static void issue_after_commit(struct batcher *b, struct bio *bio)
 182 {
 183         bool commit_scheduled;
 184
 185         spin_lock_irq(&b->lock);
 186         commit_scheduled = b->commit_scheduled;
 187         bio_list_add(&b->bios, bio);
 188         spin_unlock_irq(&b->lock);
 189
 190         if (commit_scheduled)
 191                 async_commit(b);
 192 }
 193
 194 /*
 195  * Call this if some urgent work is waiting for the commit to complete.
 196  */
 197 static void schedule_commit(struct batcher *b)
 198 {
 199         bool immediate;
 200
 201         spin_lock_irq(&b->lock);
 202         immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
 203         b->commit_scheduled = true;
 204         spin_unlock_irq(&b->lock);
 205
 206         if (immediate)
 207                 async_commit(b);
 208 }
 209
/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
        /* The bio's original completion callback, saved by dm_hook_bio(). */
        bio_end_io_t *bi_end_io;
};
 218
 219 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
 220                         bio_end_io_t *bi_end_io, void *bi_private)
 221 {
 222         h->bi_end_io = bio->bi_end_io;
 223
 224         bio->bi_end_io = bi_end_io;
 225         bio->bi_private = bi_private;
 226 }
 227
 228 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 229 {
 230         bio->bi_end_io = h->bi_end_io;
 231 }
 232
 233 /*----------------------------------------------------------------*/
 234
/* NOTE(review): presumably the reserve size of cache->migration_pool — confirm at the mempool init site. */
#define MIGRATION_POOL_SIZE 128
/* Expressed in jiffies (HZ jiffies per second). */
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.  Both limits are expressed in sectors.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 245
/*
 * Metadata access mode.  The ordering is relied upon: save_stats() and
 * abort_transaction() test 'mode >= CM_READ_ONLY' to bail out early.
 */
enum cache_metadata_mode {
        CM_WRITE,               /* metadata may be changed */
        CM_READ_ONLY,           /* metadata may not be changed */
        CM_FAIL                 /* once entered, set_cache_mode() never leaves it */
};
 251
/*
 * How data io is routed between the origin and the cache device.
 */
enum cache_io_mode {
        /*
         * Data is written to cached blocks only.  These blocks are marked
         * dirty.  If you lose the cache device you will lose data.
         * Potential performance increase for both reads and writes.
         */
        CM_IO_WRITEBACK,

        /*
         * Data is written to both cache and origin.  Blocks are never
         * dirty.  Potential performance benefit for reads only.
         */
        CM_IO_WRITETHROUGH,

        /*
         * A degraded mode useful for various cache coherency situations
         * (eg, rolling back snapshots).  Reads and writes always go to the
         * origin.  If a write goes to a cached oblock, then the cache
         * block is invalidated.
         */
        CM_IO_PASSTHROUGH
};
 274
/*
 * Feature and mode settings of a cache.
 */
struct cache_features {
        /* Current metadata mode; read via get_cache_mode(), changed via set_cache_mode(). */
        enum cache_metadata_mode mode;
        /* Tested by writeback_mode(), writethrough_mode() and passthrough_mode(). */
        enum cache_io_mode io_mode;
        unsigned int metadata_version;
        bool discard_passdown:1;
};
 281
/*
 * Runtime counters.
 */
struct cache_stats {
        /* The four hit/miss counters are restored by load_stats() and persisted by save_stats(). */
        atomic_t read_hit;
        atomic_t read_miss;
        atomic_t write_hit;
        atomic_t write_miss;
        /* demotion, promotion and writeback are bumped by update_stats(). */
        atomic_t demotion;
        atomic_t promotion;
        atomic_t writeback;
        atomic_t copies_avoided;
        atomic_t cache_cell_clash;
        atomic_t commit_count;
        /* Bumped by set_discard(). */
        atomic_t discard_count;
};
 295
/*
 * Per-target state of a cache.
 */
struct cache {
        struct dm_target *ti;
        /*
         * Taken around updates to deferred_bios, discard_bitset and
         * need_tick_bio.
         */
        spinlock_t lock;

        /*
         * Fields for converting from sectors to blocks.
         * sectors_per_block_shift is negative when the block size is not
         * a power of two (see block_size_is_power_of_two()).
         */
        int sectors_per_block_shift;
        sector_t sectors_per_block;

        struct dm_cache_metadata *cmd;

        /*
         * Metadata is written to this device.
         */
        struct dm_dev *metadata_dev;

        /*
         * The slower of the two data devices.  Typically a spindle.
         */
        struct dm_dev *origin_dev;

        /*
         * The faster of the two data devices.  Typically an SSD.
         */
        struct dm_dev *cache_dev;

        /*
         * Size of the origin device in _complete_ blocks and native sectors.
         */
        dm_oblock_t origin_blocks;
        sector_t origin_sectors;

        /*
         * Size of the cache device in blocks.
         */
        dm_cblock_t cache_size;

        /*
         * Invalidation fields.
         */
        spinlock_t invalidation_lock;
        struct list_head invalidation_requests;

        sector_t migration_threshold;
        /* Woken by free_migration() when nr_allocated_migrations drops to zero. */
        wait_queue_head_t migration_wait;
        /* Incremented in alloc_migration(), decremented in free_migration(). */
        atomic_t nr_allocated_migrations;

        /*
         * The number of in flight migrations that are performing
         * background io. eg, promotion, writeback.
         */
        atomic_t nr_io_migrations;

        /* Bios queued by defer_bio()/defer_bios() for the deferred bio worker. */
        struct bio_list deferred_bios;

        struct rw_semaphore quiesce_lock;

        /*
         * origin_blocks entries, discarded if set.
         */
        dm_dblock_t discard_nr_blocks;
        unsigned long *discard_bitset;
        uint32_t discard_block_size; /* a power of 2 times sectors per block */

        /*
         * Rather than reconstructing the table line for the status we just
         * save it and regurgitate.
         */
        unsigned int nr_ctr_args;
        const char **ctr_args;

        struct dm_kcopyd_client *copier;
        /* Queued on wq by wake_deferred_bio_worker(). */
        struct work_struct deferred_bio_worker;
        /* Queued on wq by wake_migration_worker() (never in passthrough mode). */
        struct work_struct migration_worker;
        struct workqueue_struct *wq;
        struct delayed_work waker;
        struct dm_bio_prison_v2 *prison;

        /*
         * cache_size entries, dirty if set
         */
        unsigned long *dirty_bitset;
        /* Kept in step with dirty_bitset by set_dirty()/force_set_dirty()/force_clear_dirty(). */
        atomic_t nr_dirty;

        unsigned int policy_nr_args;
        struct dm_cache_policy *policy;

        /*
         * Cache features such as write-through.
         */
        struct cache_features features;

        struct cache_stats stats;

        /* Consumed by check_if_tick_bio_needed(), which marks one bio as the tick bio. */
        bool need_tick_bio:1;
        bool sized:1;
        bool invalidate:1;
        bool commit_requested:1;
        bool loaded_mappings:1;
        bool loaded_discards:1;

        struct rw_semaphore background_work_lock;

        struct batcher committer;
        struct work_struct commit_ws;

        /* Fed by accounted_begin()/accounted_complete(). */
        struct dm_io_tracker tracker;

        /* Backs alloc_migration()/free_migration(). */
        mempool_t migration_pool;

        /* Bio set used for the origin clone in remap_to_origin_and_cache(). */
        struct bio_set bs;
};
 409
/*
 * Per-bio state, obtained with get_per_bio_data().
 */
struct per_bio_data {
        /* Set by check_if_tick_bio_needed() on the bio chosen as tick bio. */
        bool tick:1;
        /* Filled from dm_bio_get_target_bio_nr() in init_per_bio_data(). */
        unsigned int req_nr:2;
        /* Prison cell recorded by bio_detain_shared(); NULL after init. */
        struct dm_bio_prison_cell_v2 *cell;
        struct dm_hook_info hook_info;
        /* Sectors reported to the io tracker by accounted_begin(); 0 if not accounted. */
        sector_t len;
};
 417
/*
 * State of one migration.  Allocated zeroed from cache->migration_pool by
 * alloc_migration() and released with free_migration().
 */
struct dm_cache_migration {
        struct continuation k;
        /* Owning cache; set by alloc_migration(). */
        struct cache *cache;

        struct policy_work *op;
        struct bio *overwrite_bio;
        struct dm_bio_prison_cell_v2 *cell;

        dm_cblock_t invalidate_cblock;
        dm_oblock_t invalidate_oblock;
};
 429
 430 /*----------------------------------------------------------------*/
 431
 432 static bool writethrough_mode(struct cache *cache)
 433 {
 434         return cache->features.io_mode == CM_IO_WRITETHROUGH;
 435 }
 436
 437 static bool writeback_mode(struct cache *cache)
 438 {
 439         return cache->features.io_mode == CM_IO_WRITEBACK;
 440 }
 441
 442 static inline bool passthrough_mode(struct cache *cache)
 443 {
 444         return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
 445 }
 446
 447 /*----------------------------------------------------------------*/
 448
 449 static void wake_deferred_bio_worker(struct cache *cache)
 450 {
 451         queue_work(cache->wq, &cache->deferred_bio_worker);
 452 }
 453
 454 static void wake_migration_worker(struct cache *cache)
 455 {
 456         if (passthrough_mode(cache))
 457                 return;
 458
 459         queue_work(cache->wq, &cache->migration_worker);
 460 }
 461
 462 /*----------------------------------------------------------------*/
 463
 464 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
 465 {
 466         return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
 467 }
 468
 469 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
 470 {
 471         dm_bio_prison_free_cell_v2(cache->prison, cell);
 472 }
 473
 474 static struct dm_cache_migration *alloc_migration(struct cache *cache)
 475 {
 476         struct dm_cache_migration *mg;
 477
 478         mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
 479
 480         memset(mg, 0, sizeof(*mg));
 481
 482         mg->cache = cache;
 483         atomic_inc(&cache->nr_allocated_migrations);
 484
 485         return mg;
 486 }
 487
 488 static void free_migration(struct dm_cache_migration *mg)
 489 {
 490         struct cache *cache = mg->cache;
 491
 492         if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 493                 wake_up(&cache->migration_wait);
 494
 495         mempool_free(mg, &cache->migration_pool);
 496 }
 497
 498 /*----------------------------------------------------------------*/
 499
 500 static inline dm_oblock_t oblock_succ(dm_oblock_t b)
 501 {
 502         return to_oblock(from_oblock(b) + 1ull);
 503 }
 504
 505 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
 506 {
 507         key->virtual = 0;
 508         key->dev = 0;
 509         key->block_begin = from_oblock(begin);
 510         key->block_end = from_oblock(end);
 511 }
 512
/*
 * We have two lock levels.  Level 0, which is used to prevent WRITEs, and
 * level 1 which prevents *both* READs and WRITEs.
 *
 * lock_level() picks the level a bio is detained at: WRITE_LOCK_LEVEL for
 * writes, READ_WRITE_LOCK_LEVEL for everything else.
 */
#define WRITE_LOCK_LEVEL 0
#define READ_WRITE_LOCK_LEVEL 1
 519
 520 static unsigned int lock_level(struct bio *bio)
 521 {
 522         return bio_data_dir(bio) == WRITE ?
 523                 WRITE_LOCK_LEVEL :
 524                 READ_WRITE_LOCK_LEVEL;
 525 }
 526
 527 /*
 528  *--------------------------------------------------------------
 529  * Per bio data
 530  *--------------------------------------------------------------
 531  */
 532
 533 static struct per_bio_data *get_per_bio_data(struct bio *bio)
 534 {
 535         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 536
 537         BUG_ON(!pb);
 538         return pb;
 539 }
 540
 541 static struct per_bio_data *init_per_bio_data(struct bio *bio)
 542 {
 543         struct per_bio_data *pb = get_per_bio_data(bio);
 544
 545         pb->tick = false;
 546         pb->req_nr = dm_bio_get_target_bio_nr(bio);
 547         pb->cell = NULL;
 548         pb->len = 0;
 549
 550         return pb;
 551 }
 552
 553 /*----------------------------------------------------------------*/
 554
 555 static void defer_bio(struct cache *cache, struct bio *bio)
 556 {
 557         spin_lock_irq(&cache->lock);
 558         bio_list_add(&cache->deferred_bios, bio);
 559         spin_unlock_irq(&cache->lock);
 560
 561         wake_deferred_bio_worker(cache);
 562 }
 563
 564 static void defer_bios(struct cache *cache, struct bio_list *bios)
 565 {
 566         spin_lock_irq(&cache->lock);
 567         bio_list_merge_init(&cache->deferred_bios, bios);
 568         spin_unlock_irq(&cache->lock);
 569
 570         wake_deferred_bio_worker(cache);
 571 }
 572
 573 /*----------------------------------------------------------------*/
 574
 575 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
 576 {
 577         bool r;
 578         struct per_bio_data *pb;
 579         struct dm_cell_key_v2 key;
 580         dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
 581         struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
 582
 583         cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
 584
 585         build_key(oblock, end, &key);
 586         r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
 587         if (!r) {
 588                 /*
 589                  * Failed to get the lock.
 590                  */
 591                 free_prison_cell(cache, cell_prealloc);
 592                 return r;
 593         }
 594
 595         if (cell != cell_prealloc)
 596                 free_prison_cell(cache, cell_prealloc);
 597
 598         pb = get_per_bio_data(bio);
 599         pb->cell = cell;
 600
 601         return r;
 602 }
 603
 604 /*----------------------------------------------------------------*/
 605
 606 static bool is_dirty(struct cache *cache, dm_cblock_t b)
 607 {
 608         return test_bit(from_cblock(b), cache->dirty_bitset);
 609 }
 610
 611 static void set_dirty(struct cache *cache, dm_cblock_t cblock)
 612 {
 613         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 614                 atomic_inc(&cache->nr_dirty);
 615                 policy_set_dirty(cache->policy, cblock);
 616         }
 617 }
 618
 619 /*
 620  * These two are called when setting after migrations to force the policy
 621  * and dirty bitset to be in sync.
 622  */
 623 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
 624 {
 625         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
 626                 atomic_inc(&cache->nr_dirty);
 627         policy_set_dirty(cache->policy, cblock);
 628 }
 629
 630 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
 631 {
 632         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 633                 if (atomic_dec_return(&cache->nr_dirty) == 0)
 634                         dm_table_event(cache->ti->table);
 635         }
 636
 637         policy_clear_dirty(cache->policy, cblock);
 638 }
 639
 640 /*----------------------------------------------------------------*/
 641
 642 static bool block_size_is_power_of_two(struct cache *cache)
 643 {
 644         return cache->sectors_per_block_shift >= 0;
 645 }
 646
 647 static dm_block_t block_div(dm_block_t b, uint32_t n)
 648 {
 649         do_div(b, n);
 650
 651         return b;
 652 }
 653
 654 static dm_block_t oblocks_per_dblock(struct cache *cache)
 655 {
 656         dm_block_t oblocks = cache->discard_block_size;
 657
 658         if (block_size_is_power_of_two(cache))
 659                 oblocks >>= cache->sectors_per_block_shift;
 660         else
 661                 oblocks = block_div(oblocks, cache->sectors_per_block);
 662
 663         return oblocks;
 664 }
 665
 666 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 667 {
 668         return to_dblock(block_div(from_oblock(oblock),
 669                                    oblocks_per_dblock(cache)));
 670 }
 671
 672 static void set_discard(struct cache *cache, dm_dblock_t b)
 673 {
 674         BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 675         atomic_inc(&cache->stats.discard_count);
 676
 677         spin_lock_irq(&cache->lock);
 678         set_bit(from_dblock(b), cache->discard_bitset);
 679         spin_unlock_irq(&cache->lock);
 680 }
 681
 682 static void clear_discard(struct cache *cache, dm_dblock_t b)
 683 {
 684         spin_lock_irq(&cache->lock);
 685         clear_bit(from_dblock(b), cache->discard_bitset);
 686         spin_unlock_irq(&cache->lock);
 687 }
 688
 689 static bool is_discarded(struct cache *cache, dm_dblock_t b)
 690 {
 691         int r;
 692
 693         spin_lock_irq(&cache->lock);
 694         r = test_bit(from_dblock(b), cache->discard_bitset);
 695         spin_unlock_irq(&cache->lock);
 696
 697         return r;
 698 }
 699
 700 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 701 {
 702         int r;
 703
 704         spin_lock_irq(&cache->lock);
 705         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 706                      cache->discard_bitset);
 707         spin_unlock_irq(&cache->lock);
 708
 709         return r;
 710 }
 711
 712 /*
 713  * -------------------------------------------------------------
 714  * Remapping
 715  *--------------------------------------------------------------
 716  */
 717 static void remap_to_origin(struct cache *cache, struct bio *bio)
 718 {
 719         bio_set_dev(bio, cache->origin_dev->bdev);
 720 }
 721
 722 static void remap_to_cache(struct cache *cache, struct bio *bio,
 723                            dm_cblock_t cblock)
 724 {
 725         sector_t bi_sector = bio->bi_iter.bi_sector;
 726         sector_t block = from_cblock(cblock);
 727
 728         bio_set_dev(bio, cache->cache_dev->bdev);
 729         if (!block_size_is_power_of_two(cache))
 730                 bio->bi_iter.bi_sector =
 731                         (block * cache->sectors_per_block) +
 732                         sector_div(bi_sector, cache->sectors_per_block);
 733         else
 734                 bio->bi_iter.bi_sector =
 735                         (block << cache->sectors_per_block_shift) |
 736                         (bi_sector & (cache->sectors_per_block - 1));
 737 }
 738
 739 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 740 {
 741         struct per_bio_data *pb;
 742
 743         spin_lock_irq(&cache->lock);
 744         if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 745             bio_op(bio) != REQ_OP_DISCARD) {
 746                 pb = get_per_bio_data(bio);
 747                 pb->tick = true;
 748                 cache->need_tick_bio = false;
 749         }
 750         spin_unlock_irq(&cache->lock);
 751 }
 752
 753 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 754                                           dm_oblock_t oblock)
 755 {
 756         // FIXME: check_if_tick_bio_needed() is called way too much through this interface
 757         check_if_tick_bio_needed(cache, bio);
 758         remap_to_origin(cache, bio);
 759         if (bio_data_dir(bio) == WRITE)
 760                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 761 }
 762
 763 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 764                                  dm_oblock_t oblock, dm_cblock_t cblock)
 765 {
 766         check_if_tick_bio_needed(cache, bio);
 767         remap_to_cache(cache, bio, cblock);
 768         if (bio_data_dir(bio) == WRITE) {
 769                 set_dirty(cache, cblock);
 770                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 771         }
 772 }
 773
 774 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 775 {
 776         sector_t block_nr = bio->bi_iter.bi_sector;
 777
 778         if (!block_size_is_power_of_two(cache))
 779                 (void) sector_div(block_nr, cache->sectors_per_block);
 780         else
 781                 block_nr >>= cache->sectors_per_block_shift;
 782
 783         return to_oblock(block_nr);
 784 }
 785
 786 static bool accountable_bio(struct cache *cache, struct bio *bio)
 787 {
 788         return bio_op(bio) != REQ_OP_DISCARD;
 789 }
 790
 791 static void accounted_begin(struct cache *cache, struct bio *bio)
 792 {
 793         struct per_bio_data *pb;
 794
 795         if (accountable_bio(cache, bio)) {
 796                 pb = get_per_bio_data(bio);
 797                 pb->len = bio_sectors(bio);
 798                 dm_iot_io_begin(&cache->tracker, pb->len);
 799         }
 800 }
 801
 802 static void accounted_complete(struct cache *cache, struct bio *bio)
 803 {
 804         struct per_bio_data *pb = get_per_bio_data(bio);
 805
 806         dm_iot_io_end(&cache->tracker, pb->len);
 807 }
 808
 809 static void accounted_request(struct cache *cache, struct bio *bio)
 810 {
 811         accounted_begin(cache, bio);
 812         dm_submit_bio_remap(bio, NULL);
 813 }
 814
 815 static void issue_op(struct bio *bio, void *context)
 816 {
 817         struct cache *cache = context;
 818
 819         accounted_request(cache, bio);
 820 }
 821
 822 /*
 823  * When running in writethrough mode we need to send writes to clean blocks
 824  * to both the cache and origin devices.  Clone the bio and send them in parallel.
 825  */
 826 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
 827                                       dm_oblock_t oblock, dm_cblock_t cblock)
 828 {
 829         struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio,
 830                                                  GFP_NOIO, &cache->bs);
 831
 832         BUG_ON(!origin_bio);
 833
 834         bio_chain(origin_bio, bio);
 835
 836         if (bio_data_dir(origin_bio) == WRITE)
 837                 clear_discard(cache, oblock_to_dblock(cache, oblock));
 838         submit_bio(origin_bio);
 839
 840         remap_to_cache(cache, bio, cblock);
 841 }
 842
 843 /*
 844  *--------------------------------------------------------------
 845  * Failure modes
 846  *--------------------------------------------------------------
 847  */
 848 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 849 {
 850         return cache->features.mode;
 851 }
 852
 853 static const char *cache_device_name(struct cache *cache)
 854 {
 855         return dm_table_device_name(cache->ti->table);
 856 }
 857
 858 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 859 {
 860         static const char *descs[] = {
 861                 "write",
 862                 "read-only",
 863                 "fail"
 864         };
 865
 866         dm_table_event(cache->ti->table);
 867         DMINFO("%s: switching cache to %s mode",
 868                cache_device_name(cache), descs[(int)mode]);
 869 }
 870
 871 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 872 {
 873         bool needs_check;
 874         enum cache_metadata_mode old_mode = get_cache_mode(cache);
 875
 876         if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
 877                 DMERR("%s: unable to read needs_check flag, setting failure mode.",
 878                       cache_device_name(cache));
 879                 new_mode = CM_FAIL;
 880         }
 881
 882         if (new_mode == CM_WRITE && needs_check) {
 883                 DMERR("%s: unable to switch cache to write mode until repaired.",
 884                       cache_device_name(cache));
 885                 if (old_mode != new_mode)
 886                         new_mode = old_mode;
 887                 else
 888                         new_mode = CM_READ_ONLY;
 889         }
 890
 891         /* Never move out of fail mode */
 892         if (old_mode == CM_FAIL)
 893                 new_mode = CM_FAIL;
 894
 895         switch (new_mode) {
 896         case CM_FAIL:
 897         case CM_READ_ONLY:
 898                 dm_cache_metadata_set_read_only(cache->cmd);
 899                 break;
 900
 901         case CM_WRITE:
 902                 dm_cache_metadata_set_read_write(cache->cmd);
 903                 break;
 904         }
 905
 906         cache->features.mode = new_mode;
 907
 908         if (new_mode != old_mode)
 909                 notify_mode_switch(cache, new_mode);
 910 }
 911
 912 static void abort_transaction(struct cache *cache)
 913 {
 914         const char *dev_name = cache_device_name(cache);
 915
 916         if (get_cache_mode(cache) >= CM_READ_ONLY)
 917                 return;
 918
 919         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
 920         if (dm_cache_metadata_abort(cache->cmd)) {
 921                 DMERR("%s: failed to abort metadata transaction", dev_name);
 922                 set_cache_mode(cache, CM_FAIL);
 923         }
 924
 925         if (dm_cache_metadata_set_needs_check(cache->cmd)) {
 926                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
 927                 set_cache_mode(cache, CM_FAIL);
 928         }
 929 }
 930
 931 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
 932 {
 933         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 934                     cache_device_name(cache), op, r);
 935         abort_transaction(cache);
 936         set_cache_mode(cache, CM_READ_ONLY);
 937 }
 938
 939 /*----------------------------------------------------------------*/
 940
 941 static void load_stats(struct cache *cache)
 942 {
 943         struct dm_cache_statistics stats;
 944
 945         dm_cache_metadata_get_stats(cache->cmd, &stats);
 946         atomic_set(&cache->stats.read_hit, stats.read_hits);
 947         atomic_set(&cache->stats.read_miss, stats.read_misses);
 948         atomic_set(&cache->stats.write_hit, stats.write_hits);
 949         atomic_set(&cache->stats.write_miss, stats.write_misses);
 950 }
 951
 952 static void save_stats(struct cache *cache)
 953 {
 954         struct dm_cache_statistics stats;
 955
 956         if (get_cache_mode(cache) >= CM_READ_ONLY)
 957                 return;
 958
 959         stats.read_hits = atomic_read(&cache->stats.read_hit);
 960         stats.read_misses = atomic_read(&cache->stats.read_miss);
 961         stats.write_hits = atomic_read(&cache->stats.write_hit);
 962         stats.write_misses = atomic_read(&cache->stats.write_miss);
 963
 964         dm_cache_metadata_set_stats(cache->cmd, &stats);
 965 }
 966
 967 static void update_stats(struct cache_stats *stats, enum policy_operation op)
 968 {
 969         switch (op) {
 970         case POLICY_PROMOTE:
 971                 atomic_inc(&stats->promotion);
 972                 break;
 973
 974         case POLICY_DEMOTE:
 975                 atomic_inc(&stats->demotion);
 976                 break;
 977
 978         case POLICY_WRITEBACK:
 979                 atomic_inc(&stats->writeback);
 980                 break;
 981         }
 982 }
 983
 984 /*
 985  *---------------------------------------------------------------------
 986  * Migration processing
 987  *
 988  * Migration covers moving data from the origin device to the cache, or
 989  * vice versa.
 990  *---------------------------------------------------------------------
 991  */
 992 static void inc_io_migrations(struct cache *cache)
 993 {
 994         atomic_inc(&cache->nr_io_migrations);
 995 }
 996
 997 static void dec_io_migrations(struct cache *cache)
 998 {
 999         atomic_dec(&cache->nr_io_migrations);
1000 }
1001
1002 static bool discard_or_flush(struct bio *bio)
1003 {
1004         return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1005 }
1006
1007 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1008                                      dm_dblock_t *b, dm_dblock_t *e)
1009 {
1010         sector_t sb = bio->bi_iter.bi_sector;
1011         sector_t se = bio_end_sector(bio);
1012
1013         *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1014
1015         if (se - sb < cache->discard_block_size)
1016                 *e = *b;
1017         else
1018                 *e = to_dblock(block_div(se, cache->discard_block_size));
1019 }
1020
1021 /*----------------------------------------------------------------*/
1022
1023 static void prevent_background_work(struct cache *cache)
1024 {
1025         lockdep_off();
1026         down_write(&cache->background_work_lock);
1027         lockdep_on();
1028 }
1029
1030 static void allow_background_work(struct cache *cache)
1031 {
1032         lockdep_off();
1033         up_write(&cache->background_work_lock);
1034         lockdep_on();
1035 }
1036
1037 static bool background_work_begin(struct cache *cache)
1038 {
1039         bool r;
1040
1041         lockdep_off();
1042         r = down_read_trylock(&cache->background_work_lock);
1043         lockdep_on();
1044
1045         return r;
1046 }
1047
1048 static void background_work_end(struct cache *cache)
1049 {
1050         lockdep_off();
1051         up_read(&cache->background_work_lock);
1052         lockdep_on();
1053 }
1054
1055 /*----------------------------------------------------------------*/
1056
1057 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1058 {
1059         return (bio_data_dir(bio) == WRITE) &&
1060                 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1061 }
1062
1063 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1064 {
1065         return writeback_mode(cache) &&
1066                 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1067 }
1068
1069 static void quiesce(struct dm_cache_migration *mg,
1070                     void (*continuation)(struct work_struct *))
1071 {
1072         init_continuation(&mg->k, continuation);
1073         dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1074 }
1075
1076 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1077 {
1078         struct continuation *k = container_of(ws, struct continuation, ws);
1079
1080         return container_of(k, struct dm_cache_migration, k);
1081 }
1082
1083 static void copy_complete(int read_err, unsigned long write_err, void *context)
1084 {
1085         struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1086
1087         if (read_err || write_err)
1088                 mg->k.input = BLK_STS_IOERR;
1089
1090         queue_continuation(mg->cache->wq, &mg->k);
1091 }
1092
1093 static void copy(struct dm_cache_migration *mg, bool promote)
1094 {
1095         struct dm_io_region o_region, c_region;
1096         struct cache *cache = mg->cache;
1097
1098         o_region.bdev = cache->origin_dev->bdev;
1099         o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1100         o_region.count = cache->sectors_per_block;
1101
1102         c_region.bdev = cache->cache_dev->bdev;
1103         c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1104         c_region.count = cache->sectors_per_block;
1105
1106         if (promote)
1107                 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1108         else
1109                 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1110 }
1111
1112 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1113 {
1114         struct per_bio_data *pb = get_per_bio_data(bio);
1115
1116         if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1117                 free_prison_cell(cache, pb->cell);
1118         pb->cell = NULL;
1119 }
1120
1121 static void overwrite_endio(struct bio *bio)
1122 {
1123         struct dm_cache_migration *mg = bio->bi_private;
1124         struct cache *cache = mg->cache;
1125         struct per_bio_data *pb = get_per_bio_data(bio);
1126
1127         dm_unhook_bio(&pb->hook_info, bio);
1128
1129         if (bio->bi_status)
1130                 mg->k.input = bio->bi_status;
1131
1132         queue_continuation(cache->wq, &mg->k);
1133 }
1134
1135 static void overwrite(struct dm_cache_migration *mg,
1136                       void (*continuation)(struct work_struct *))
1137 {
1138         struct bio *bio = mg->overwrite_bio;
1139         struct per_bio_data *pb = get_per_bio_data(bio);
1140
1141         dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1142
1143         /*
1144          * The overwrite bio is part of the copy operation, as such it does
1145          * not set/clear discard or dirty flags.
1146          */
1147         if (mg->op->op == POLICY_PROMOTE)
1148                 remap_to_cache(mg->cache, bio, mg->op->cblock);
1149         else
1150                 remap_to_origin(mg->cache, bio);
1151
1152         init_continuation(&mg->k, continuation);
1153         accounted_request(mg->cache, bio);
1154 }
1155
1156 /*
1157  * Migration steps:
1158  *
1159  * 1) exclusive lock preventing WRITEs
1160  * 2) quiesce
1161  * 3) copy or issue overwrite bio
1162  * 4) upgrade to exclusive lock preventing READs and WRITEs
1163  * 5) quiesce
1164  * 6) update metadata and commit
1165  * 7) unlock
1166  */
1167 static void mg_complete(struct dm_cache_migration *mg, bool success)
1168 {
1169         struct bio_list bios;
1170         struct cache *cache = mg->cache;
1171         struct policy_work *op = mg->op;
1172         dm_cblock_t cblock = op->cblock;
1173
1174         if (success)
1175                 update_stats(&cache->stats, op->op);
1176
1177         switch (op->op) {
1178         case POLICY_PROMOTE:
1179                 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1180                 policy_complete_background_work(cache->policy, op, success);
1181
1182                 if (mg->overwrite_bio) {
1183                         if (success)
1184                                 force_set_dirty(cache, cblock);
1185                         else if (mg->k.input)
1186                                 mg->overwrite_bio->bi_status = mg->k.input;
1187                         else
1188                                 mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1189                         bio_endio(mg->overwrite_bio);
1190                 } else {
1191                         if (success)
1192                                 force_clear_dirty(cache, cblock);
1193                         dec_io_migrations(cache);
1194                 }
1195                 break;
1196
1197         case POLICY_DEMOTE:
1198                 /*
1199                  * We clear dirty here to update the nr_dirty counter.
1200                  */
1201                 if (success)
1202                         force_clear_dirty(cache, cblock);
1203                 policy_complete_background_work(cache->policy, op, success);
1204                 dec_io_migrations(cache);
1205                 break;
1206
1207         case POLICY_WRITEBACK:
1208                 if (success)
1209                         force_clear_dirty(cache, cblock);
1210                 policy_complete_background_work(cache->policy, op, success);
1211                 dec_io_migrations(cache);
1212                 break;
1213         }
1214
1215         bio_list_init(&bios);
1216         if (mg->cell) {
1217                 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1218                         free_prison_cell(cache, mg->cell);
1219         }
1220
1221         free_migration(mg);
1222         defer_bios(cache, &bios);
1223         wake_migration_worker(cache);
1224
1225         background_work_end(cache);
1226 }
1227
1228 static void mg_success(struct work_struct *ws)
1229 {
1230         struct dm_cache_migration *mg = ws_to_mg(ws);
1231
1232         mg_complete(mg, mg->k.input == 0);
1233 }
1234
/*
 * Step 6 of a migration: record the outcome in the metadata.
 *
 * - promote:   insert the cblock -> oblock mapping, then complete.
 * - demote:    remove the mapping, then complete only after a commit
 *              (via the mg_success continuation).
 * - writeback: no mapping change; complete immediately.
 *
 * A failing metadata call is reported through metadata_operation_failed()
 * and the migration is completed as failed.
 */
static void mg_update_metadata(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;

	switch (op->op) {
	case POLICY_PROMOTE:
		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
		if (r) {
			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);

			mg_complete(mg, false);
			return;
		}
		mg_complete(mg, true);
		break;

	case POLICY_DEMOTE:
		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
		if (r) {
			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);

			mg_complete(mg, false);
			return;
		}

		/*
		 * It would be nice if we only had to commit when a REQ_FLUSH
		 * comes through.  But there's one scenario that we have to
		 * look out for:
		 *
		 * - vblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for vblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}
1296
1297 static void mg_update_metadata_after_copy(struct work_struct *ws)
1298 {
1299         struct dm_cache_migration *mg = ws_to_mg(ws);
1300
1301         /*
1302          * Did the copy succeed?
1303          */
1304         if (mg->k.input)
1305                 mg_complete(mg, false);
1306         else
1307                 mg_update_metadata(ws);
1308 }
1309
1310 static void mg_upgrade_lock(struct work_struct *ws)
1311 {
1312         int r;
1313         struct dm_cache_migration *mg = ws_to_mg(ws);
1314
1315         /*
1316          * Did the copy succeed?
1317          */
1318         if (mg->k.input)
1319                 mg_complete(mg, false);
1320
1321         else {
1322                 /*
1323                  * Now we want the lock to prevent both reads and writes.
1324                  */
1325                 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1326                                             READ_WRITE_LOCK_LEVEL);
1327                 if (r < 0)
1328                         mg_complete(mg, false);
1329
1330                 else if (r)
1331                         quiesce(mg, mg_update_metadata);
1332
1333                 else
1334                         mg_update_metadata(ws);
1335         }
1336 }
1337
1338 static void mg_full_copy(struct work_struct *ws)
1339 {
1340         struct dm_cache_migration *mg = ws_to_mg(ws);
1341         struct cache *cache = mg->cache;
1342         struct policy_work *op = mg->op;
1343         bool is_policy_promote = (op->op == POLICY_PROMOTE);
1344
1345         if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1346             is_discarded_oblock(cache, op->oblock)) {
1347                 mg_upgrade_lock(ws);
1348                 return;
1349         }
1350
1351         init_continuation(&mg->k, mg_upgrade_lock);
1352         copy(mg, is_policy_promote);
1353 }
1354
/*
 * Step 3 of a migration: move the data, either by issuing the overwrite
 * bio or by a full kcopyd copy.
 *
 * If an overwrite bio was supplied but is no longer optimisable, the bio
 * is detached from the migration and the migration falls back to a full
 * copy.
 */
static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable.  So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fallback to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);

			/*
			 * map_bio() treats a false return from bio_detain_shared()
			 * as "an exclusive lock is held", so a true return here
			 * would mean the bio was granted a shared lock despite this
			 * migration's lock on the block.
			 */
			BUG_ON(rb);
			mg->overwrite_bio = NULL;
			/*
			 * mg_start() only counts migrations that have no bio;
			 * this one is now bio-less and mg_complete() will
			 * decrement the counter.
			 */
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
		 */
		overwrite(mg, mg_update_metadata_after_copy);

	} else
		mg_full_copy(ws);
}
1390
1391 static int mg_lock_writes(struct dm_cache_migration *mg)
1392 {
1393         int r;
1394         struct dm_cell_key_v2 key;
1395         struct cache *cache = mg->cache;
1396         struct dm_bio_prison_cell_v2 *prealloc;
1397
1398         prealloc = alloc_prison_cell(cache);
1399
1400         /*
1401          * Prevent writes to the block, but allow reads to continue.
1402          * Unless we're using an overwrite bio, in which case we lock
1403          * everything.
1404          */
1405         build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1406         r = dm_cell_lock_v2(cache->prison, &key,
1407                             mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1408                             prealloc, &mg->cell);
1409         if (r < 0) {
1410                 free_prison_cell(cache, prealloc);
1411                 mg_complete(mg, false);
1412                 return r;
1413         }
1414
1415         if (mg->cell != prealloc)
1416                 free_prison_cell(cache, prealloc);
1417
1418         if (r == 0)
1419                 mg_copy(&mg->k.ws);
1420         else
1421                 quiesce(mg, mg_copy);
1422
1423         return 0;
1424 }
1425
1426 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1427 {
1428         struct dm_cache_migration *mg;
1429
1430         if (!background_work_begin(cache)) {
1431                 policy_complete_background_work(cache->policy, op, false);
1432                 return -EPERM;
1433         }
1434
1435         mg = alloc_migration(cache);
1436
1437         mg->op = op;
1438         mg->overwrite_bio = bio;
1439
1440         if (!bio)
1441                 inc_io_migrations(cache);
1442
1443         return mg_lock_writes(mg);
1444 }
1445
1446 /*
1447  *--------------------------------------------------------------
1448  * invalidation processing
1449  *--------------------------------------------------------------
1450  */
1451
1452 static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1453 {
1454         struct bio_list bios;
1455         struct cache *cache = mg->cache;
1456
1457         bio_list_init(&bios);
1458         if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1459                 free_prison_cell(cache, mg->cell);
1460
1461         if (!success && mg->overwrite_bio)
1462                 bio_io_error(mg->overwrite_bio);
1463
1464         free_migration(mg);
1465         defer_bios(cache, &bios);
1466
1467         background_work_end(cache);
1468 }
1469
1470 static void invalidate_completed(struct work_struct *ws)
1471 {
1472         struct dm_cache_migration *mg = ws_to_mg(ws);
1473
1474         invalidate_complete(mg, !mg->k.input);
1475 }
1476
1477 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1478 {
1479         int r;
1480
1481         r = policy_invalidate_mapping(cache->policy, cblock);
1482         if (!r) {
1483                 r = dm_cache_remove_mapping(cache->cmd, cblock);
1484                 if (r) {
1485                         DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1486                                     cache_device_name(cache));
1487                         metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1488                 }
1489
1490         } else if (r == -ENODATA) {
1491                 /*
1492                  * Harmless, already unmapped.
1493                  */
1494                 r = 0;
1495
1496         } else
1497                 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1498
1499         return r;
1500 }
1501
/*
 * Work item that performs the invalidation once the cell lock is held:
 * remove the mapping, then finish via invalidate_completed() after the
 * next commit.  If removing the mapping fails the invalidation is
 * completed as failed straight away.
 */
static void invalidate_remove(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;

	r = invalidate_cblock(cache, mg->invalidate_cblock);
	if (r) {
		invalidate_complete(mg, false);
		return;
	}

	init_continuation(&mg->k, invalidate_completed);
	continue_after_commit(&cache->committer, &mg->k);
	/*
	 * NOTE(review): the overwrite bio is remapped to the origin here and
	 * then detached from the migration, so invalidate_complete() will not
	 * touch it.  Nothing in this function submits or completes the bio -
	 * confirm which path is expected to issue it.
	 */
	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
	mg->overwrite_bio = NULL;
	schedule_commit(&cache->committer);
}
1520
1521 static int invalidate_lock(struct dm_cache_migration *mg)
1522 {
1523         int r;
1524         struct dm_cell_key_v2 key;
1525         struct cache *cache = mg->cache;
1526         struct dm_bio_prison_cell_v2 *prealloc;
1527
1528         prealloc = alloc_prison_cell(cache);
1529
1530         build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1531         r = dm_cell_lock_v2(cache->prison, &key,
1532                             READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1533         if (r < 0) {
1534                 free_prison_cell(cache, prealloc);
1535                 invalidate_complete(mg, false);
1536                 return r;
1537         }
1538
1539         if (mg->cell != prealloc)
1540                 free_prison_cell(cache, prealloc);
1541
1542         if (r)
1543                 quiesce(mg, invalidate_remove);
1544
1545         else {
1546                 /*
1547                  * We can't call invalidate_remove() directly here because we
1548                  * might still be in request context.
1549                  */
1550                 init_continuation(&mg->k, invalidate_remove);
1551                 queue_work(cache->wq, &mg->k.ws);
1552         }
1553
1554         return 0;
1555 }
1556
1557 static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1558                             dm_oblock_t oblock, struct bio *bio)
1559 {
1560         struct dm_cache_migration *mg;
1561
1562         if (!background_work_begin(cache))
1563                 return -EPERM;
1564
1565         mg = alloc_migration(cache);
1566
1567         mg->overwrite_bio = bio;
1568         mg->invalidate_cblock = cblock;
1569         mg->invalidate_oblock = oblock;
1570
1571         return invalidate_lock(mg);
1572 }
1573
1574 /*
1575  *--------------------------------------------------------------
1576  * bio processing
1577  *--------------------------------------------------------------
1578  */
1579
/*
 * Result of spare_migration_bandwidth(): whether the cache currently has
 * room for more migration work.
 */
enum busy {
	IDLE,	/* idle for a while and below the migration threshold */
	BUSY	/* otherwise */
};
1584
1585 static enum busy spare_migration_bandwidth(struct cache *cache)
1586 {
1587         bool idle = dm_iot_idle_for(&cache->tracker, HZ);
1588         sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1589                 cache->sectors_per_block;
1590
1591         if (idle && current_volume <= cache->migration_threshold)
1592                 return IDLE;
1593         else
1594                 return BUSY;
1595 }
1596
1597 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1598 {
1599         atomic_inc(bio_data_dir(bio) == READ ?
1600                    &cache->stats.read_hit : &cache->stats.write_hit);
1601 }
1602
1603 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1604 {
1605         atomic_inc(bio_data_dir(bio) == READ ?
1606                    &cache->stats.read_miss : &cache->stats.write_miss);
1607 }
1608
1609 /*----------------------------------------------------------------*/
1610
/*
 * Decide where a data bio for origin block @block should go.
 *
 * Returns DM_MAPIO_REMAPPED when the bio has been remapped and the caller
 * must submit it, or DM_MAPIO_SUBMITTED when the bio has been taken over
 * here (queued behind a lock, handed to a migration, completed, or queued
 * behind a commit).  *commit_needed is set when the caller should
 * schedule a commit.
 */
static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
		   bool *commit_needed)
{
	int r, data_dir;
	bool rb, background_queued;
	dm_cblock_t cblock;

	*commit_needed = false;

	rb = bio_detain_shared(cache, block, bio);
	if (!rb) {
		/*
		 * An exclusive lock is held for this block, so we have to
		 * wait.  We set the commit_needed flag so the current
		 * transaction will be committed asap, allowing this lock
		 * to be dropped.
		 */
		*commit_needed = true;
		return DM_MAPIO_SUBMITTED;
	}

	data_dir = bio_data_dir(bio);

	if (optimisable_bio(cache, bio, block)) {
		struct policy_work *op = NULL;

		r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
		if (unlikely(r && r != -ENOENT)) {
			DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
				    cache_device_name(cache), r);
			bio_io_error(bio);
			return DM_MAPIO_SUBMITTED;
		}

		if (r == -ENOENT && op) {
			/*
			 * Miss, and the policy wants the block promoted: use
			 * this bio as the migration's overwrite bio.
			 *
			 * NOTE(review): mg_start()'s return value is ignored.
			 * When it returns -EPERM it has not touched the bio,
			 * and nothing visible here completes or requeues it -
			 * confirm this cannot happen or is handled elsewhere.
			 */
			bio_drop_shared_lock(cache, bio);
			BUG_ON(op->op != POLICY_PROMOTE);
			mg_start(cache, op, bio);
			return DM_MAPIO_SUBMITTED;
		}
	} else {
		r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
		if (unlikely(r && r != -ENOENT)) {
			DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
				    cache_device_name(cache), r);
			bio_io_error(bio);
			return DM_MAPIO_SUBMITTED;
		}

		if (background_queued)
			wake_migration_worker(cache);
	}

	if (r == -ENOENT) {
		struct per_bio_data *pb = get_per_bio_data(bio);

		/*
		 * Miss.
		 */
		inc_miss_counter(cache, bio);
		if (pb->req_nr == 0) {
			accounted_begin(cache, bio);
			remap_to_origin_clear_discard(cache, bio, block);
		} else {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio);
			return DM_MAPIO_SUBMITTED;
		}
	} else {
		/*
		 * Hit.
		 */
		inc_hit_counter(cache, bio);

		/*
		 * Passthrough always maps to the origin, invalidating any
		 * cache blocks that are written to.
		 */
		if (passthrough_mode(cache)) {
			if (bio_data_dir(bio) == WRITE) {
				/*
				 * NOTE(review): invalidate_start()'s return
				 * value is ignored and there is no return
				 * after it, so control falls through to the
				 * REQ_FUA check and DM_MAPIO_REMAPPED below
				 * even though the bio has been handed to the
				 * invalidation - confirm this is intended.
				 */
				bio_drop_shared_lock(cache, bio);
				atomic_inc(&cache->stats.demotion);
				invalidate_start(cache, cblock, block, bio);
			} else
				remap_to_origin_clear_discard(cache, bio, block);
		} else {
			if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
			    !is_dirty(cache, cblock)) {
				remap_to_origin_and_cache(cache, bio, block, cblock);
				accounted_begin(cache, bio);
			} else
				remap_to_cache_dirty(cache, bio, block, cblock);
		}
	}

	/*
	 * dm core turns FUA requests into a separate payload and FLUSH req.
	 */
	if (bio->bi_opf & REQ_FUA) {
		/*
		 * issue_after_commit will call accounted_begin a second time.  So
		 * we call accounted_complete() to avoid double accounting.
		 */
		accounted_complete(cache, bio);
		issue_after_commit(&cache->committer, bio);
		*commit_needed = true;
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}
1725
1726 static bool process_bio(struct cache *cache, struct bio *bio)
1727 {
1728         bool commit_needed;
1729
1730         if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1731                 dm_submit_bio_remap(bio, NULL);
1732
1733         return commit_needed;
1734 }
1735
1736 /*
1737  * A non-zero return indicates read_only or fail_io mode.
1738  */
1739 static int commit(struct cache *cache, bool clean_shutdown)
1740 {
1741         int r;
1742
1743         if (get_cache_mode(cache) >= CM_READ_ONLY)
1744                 return -EINVAL;
1745
1746         atomic_inc(&cache->stats.commit_count);
1747         r = dm_cache_commit(cache->cmd, clean_shutdown);
1748         if (r)
1749                 metadata_operation_failed(cache, "dm_cache_commit", r);
1750
1751         return r;
1752 }
1753
1754 /*
1755  * Used by the batcher.
1756  */
1757 static blk_status_t commit_op(void *context)
1758 {
1759         struct cache *cache = context;
1760
1761         if (dm_cache_changed_this_transaction(cache->cmd))
1762                 return errno_to_blk_status(commit(cache, false));
1763
1764         return 0;
1765 }
1766
1767 /*----------------------------------------------------------------*/
1768
1769 static bool process_flush_bio(struct cache *cache, struct bio *bio)
1770 {
1771         struct per_bio_data *pb = get_per_bio_data(bio);
1772
1773         if (!pb->req_nr)
1774                 remap_to_origin(cache, bio);
1775         else
1776                 remap_to_cache(cache, bio, 0);
1777
1778         issue_after_commit(&cache->committer, bio);
1779         return true;
1780 }
1781
1782 static bool process_discard_bio(struct cache *cache, struct bio *bio)
1783 {
1784         dm_dblock_t b, e;
1785
1786         /*
1787          * FIXME: do we need to lock the region?  Or can we just assume the
1788          * user wont be so foolish as to issue discard concurrently with
1789          * other IO?
1790          */
1791         calc_discard_block_range(cache, bio, &b, &e);
1792         while (b != e) {
1793                 set_discard(cache, b);
1794                 b = to_dblock(from_dblock(b) + 1);
1795         }
1796
1797         if (cache->features.discard_passdown) {
1798                 remap_to_origin(cache, bio);
1799                 dm_submit_bio_remap(bio, NULL);
1800         } else
1801                 bio_endio(bio);
1802
1803         return false;
1804 }
1805
1806 static void process_deferred_bios(struct work_struct *ws)
1807 {
1808         struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1809
1810         bool commit_needed = false;
1811         struct bio_list bios;
1812         struct bio *bio;
1813
1814         bio_list_init(&bios);
1815
1816         spin_lock_irq(&cache->lock);
1817         bio_list_merge_init(&bios, &cache->deferred_bios);
1818         spin_unlock_irq(&cache->lock);
1819
1820         while ((bio = bio_list_pop(&bios))) {
1821                 if (bio->bi_opf & REQ_PREFLUSH)
1822                         commit_needed = process_flush_bio(cache, bio) || commit_needed;
1823
1824                 else if (bio_op(bio) == REQ_OP_DISCARD)
1825                         commit_needed = process_discard_bio(cache, bio) || commit_needed;
1826
1827                 else
1828                         commit_needed = process_bio(cache, bio) || commit_needed;
1829                 cond_resched();
1830         }
1831
1832         if (commit_needed)
1833                 schedule_commit(&cache->committer);
1834 }
1835
1836 /*
1837  *--------------------------------------------------------------
1838  * Main worker loop
1839  *--------------------------------------------------------------
1840  */
1841 static void requeue_deferred_bios(struct cache *cache)
1842 {
1843         struct bio *bio;
1844         struct bio_list bios;
1845
1846         bio_list_init(&bios);
1847         bio_list_merge_init(&bios, &cache->deferred_bios);
1848
1849         while ((bio = bio_list_pop(&bios))) {
1850                 bio->bi_status = BLK_STS_DM_REQUEUE;
1851                 bio_endio(bio);
1852                 cond_resched();
1853         }
1854 }
1855
1856 /*
1857  * We want to commit periodically so that not too much
1858  * unwritten metadata builds up.
1859  */
1860 static void do_waker(struct work_struct *ws)
1861 {
1862         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1863
1864         policy_tick(cache->policy, true);
1865         wake_migration_worker(cache);
1866         schedule_commit(&cache->committer);
1867         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1868 }
1869
1870 static void check_migrations(struct work_struct *ws)
1871 {
1872         int r;
1873         struct policy_work *op;
1874         struct cache *cache = container_of(ws, struct cache, migration_worker);
1875         enum busy b;
1876
1877         for (;;) {
1878                 b = spare_migration_bandwidth(cache);
1879
1880                 r = policy_get_background_work(cache->policy, b == IDLE, &op);
1881                 if (r == -ENODATA)
1882                         break;
1883
1884                 if (r) {
1885                         DMERR_LIMIT("%s: policy_background_work failed",
1886                                     cache_device_name(cache));
1887                         break;
1888                 }
1889
1890                 r = mg_start(cache, op, NULL);
1891                 if (r)
1892                         break;
1893
1894                 cond_resched();
1895         }
1896 }
1897
1898 /*
1899  *--------------------------------------------------------------
1900  * Target methods
1901  *--------------------------------------------------------------
1902  */
1903
1904 /*
1905  * This function gets called on the error paths of the constructor, so we
1906  * have to cope with a partially initialised struct.
1907  */
1908 static void destroy(struct cache *cache)
1909 {
1910         unsigned int i;
1911
1912         mempool_exit(&cache->migration_pool);
1913
1914         if (cache->prison)
1915                 dm_bio_prison_destroy_v2(cache->prison);
1916
1917         cancel_delayed_work_sync(&cache->waker);
1918         if (cache->wq)
1919                 destroy_workqueue(cache->wq);
1920
1921         if (cache->dirty_bitset)
1922                 free_bitset(cache->dirty_bitset);
1923
1924         if (cache->discard_bitset)
1925                 free_bitset(cache->discard_bitset);
1926
1927         if (cache->copier)
1928                 dm_kcopyd_client_destroy(cache->copier);
1929
1930         if (cache->cmd)
1931                 dm_cache_metadata_close(cache->cmd);
1932
1933         if (cache->metadata_dev)
1934                 dm_put_device(cache->ti, cache->metadata_dev);
1935
1936         if (cache->origin_dev)
1937                 dm_put_device(cache->ti, cache->origin_dev);
1938
1939         if (cache->cache_dev)
1940                 dm_put_device(cache->ti, cache->cache_dev);
1941
1942         if (cache->policy)
1943                 dm_cache_policy_destroy(cache->policy);
1944
1945         for (i = 0; i < cache->nr_ctr_args ; i++)
1946                 kfree(cache->ctr_args[i]);
1947         kfree(cache->ctr_args);
1948
1949         bioset_exit(&cache->bs);
1950
1951         kfree(cache);
1952 }
1953
1954 static void cache_dtr(struct dm_target *ti)
1955 {
1956         struct cache *cache = ti->private;
1957
1958         destroy(cache);
1959 }
1960
1961 static sector_t get_dev_size(struct dm_dev *dev)
1962 {
1963         return bdev_nr_sectors(dev->bdev);
1964 }
1965
1966 /*----------------------------------------------------------------*/
1967
1968 /*
1969  * Construct a cache device mapping.
1970  *
1971  * cache <metadata dev> <cache dev> <origin dev> <block size>
1972  *       <#feature args> [<feature arg>]*
1973  *       <policy> <#policy args> [<policy arg>]*
1974  *
1975  * metadata dev    : fast device holding the persistent metadata
1976  * cache dev       : fast device holding cached data blocks
1977  * origin dev      : slow device holding original data blocks
1978  * block size      : cache unit size in sectors
1979  *
1980  * #feature args   : number of feature arguments passed
1981  * feature args    : writethrough.  (The default is writeback.)
1982  *
1983  * policy          : the replacement policy to use
1984  * #policy args    : an even number of policy arguments corresponding
1985  *                   to key/value pairs passed to the policy
1986  * policy args     : key/value pairs passed to the policy
1987  *                   E.g. 'sequential_threshold 1024'
1988  *                   See cache-policies.txt for details.
1989  *
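 * Optional feature arguments are:
 *   writeback     : the default io mode.  Cache block contents are written
 *                   back later for performance reasons, so they may differ
 *                   from the corresponding origin blocks.
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *   passthrough   : passthrough io mode.  Only accepted when every cache
 *                   block is clean.
 *   metadata2     : use version 2 of the metadata format.
 *   no_discard_passdown : do not pass discards down to the origin device.
 *
 * At most one of writeback, writethrough and passthrough may be given.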
1996  */
struct cache_args {
	/* Target being constructed; used for dm_get_device()/dm_put_device(). */
	struct dm_target *ti;

	/* Opened by parse_metadata_dev(). */
	struct dm_dev *metadata_dev;

	/* Opened by parse_cache_dev(); cache_sectors is its size in sectors. */
	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	/* Opened by parse_origin_dev(); origin_sectors is its size in sectors. */
	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	/* Cache block size in sectors, validated by parse_block_size(). */
	uint32_t block_size;

	/*
	 * Policy name and its arguments.  policy_argv points into the
	 * constructor's argv; it is not a copy.
	 */
	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	/* Defaults from init_features(), overridden by parse_features(). */
	struct cache_features features;
};
2016
2017 static void destroy_cache_args(struct cache_args *ca)
2018 {
2019         if (ca->metadata_dev)
2020                 dm_put_device(ca->ti, ca->metadata_dev);
2021
2022         if (ca->cache_dev)
2023                 dm_put_device(ca->ti, ca->cache_dev);
2024
2025         if (ca->origin_dev)
2026                 dm_put_device(ca->ti, ca->origin_dev);
2027
2028         kfree(ca);
2029 }
2030
2031 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2032 {
2033         if (!as->argc) {
2034                 *error = "Insufficient args";
2035                 return false;
2036         }
2037
2038         return true;
2039 }
2040
2041 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2042                               char **error)
2043 {
2044         int r;
2045         sector_t metadata_dev_size;
2046
2047         if (!at_least_one_arg(as, error))
2048                 return -EINVAL;
2049
2050         r = dm_get_device(ca->ti, dm_shift_arg(as),
2051                           BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev);
2052         if (r) {
2053                 *error = "Error opening metadata device";
2054                 return r;
2055         }
2056
2057         metadata_dev_size = get_dev_size(ca->metadata_dev);
2058         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2059                 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
2060                        ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS);
2061
2062         return 0;
2063 }
2064
2065 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2066                            char **error)
2067 {
2068         int r;
2069
2070         if (!at_least_one_arg(as, error))
2071                 return -EINVAL;
2072
2073         r = dm_get_device(ca->ti, dm_shift_arg(as),
2074                           BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev);
2075         if (r) {
2076                 *error = "Error opening cache device";
2077                 return r;
2078         }
2079         ca->cache_sectors = get_dev_size(ca->cache_dev);
2080
2081         return 0;
2082 }
2083
2084 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2085                             char **error)
2086 {
2087         int r;
2088
2089         if (!at_least_one_arg(as, error))
2090                 return -EINVAL;
2091
2092         r = dm_get_device(ca->ti, dm_shift_arg(as),
2093                           BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev);
2094         if (r) {
2095                 *error = "Error opening origin device";
2096                 return r;
2097         }
2098
2099         ca->origin_sectors = get_dev_size(ca->origin_dev);
2100         if (ca->ti->len > ca->origin_sectors) {
2101                 *error = "Device size larger than cached device";
2102                 return -EINVAL;
2103         }
2104
2105         return 0;
2106 }
2107
2108 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2109                             char **error)
2110 {
2111         unsigned long block_size;
2112
2113         if (!at_least_one_arg(as, error))
2114                 return -EINVAL;
2115
2116         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2117             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2118             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2119             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2120                 *error = "Invalid data block size";
2121                 return -EINVAL;
2122         }
2123
2124         if (block_size > ca->cache_sectors) {
2125                 *error = "Data block size is larger than the cache device";
2126                 return -EINVAL;
2127         }
2128
2129         ca->block_size = block_size;
2130
2131         return 0;
2132 }
2133
2134 static void init_features(struct cache_features *cf)
2135 {
2136         cf->mode = CM_WRITE;
2137         cf->io_mode = CM_IO_WRITEBACK;
2138         cf->metadata_version = 1;
2139         cf->discard_passdown = true;
2140 }
2141
2142 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2143                           char **error)
2144 {
2145         static const struct dm_arg _args[] = {
2146                 {0, 3, "Invalid number of cache feature arguments"},
2147         };
2148
2149         int r, mode_ctr = 0;
2150         unsigned int argc;
2151         const char *arg;
2152         struct cache_features *cf = &ca->features;
2153
2154         init_features(cf);
2155
2156         r = dm_read_arg_group(_args, as, &argc, error);
2157         if (r)
2158                 return -EINVAL;
2159
2160         while (argc--) {
2161                 arg = dm_shift_arg(as);
2162
2163                 if (!strcasecmp(arg, "writeback")) {
2164                         cf->io_mode = CM_IO_WRITEBACK;
2165                         mode_ctr++;
2166                 }
2167
2168                 else if (!strcasecmp(arg, "writethrough")) {
2169                         cf->io_mode = CM_IO_WRITETHROUGH;
2170                         mode_ctr++;
2171                 }
2172
2173                 else if (!strcasecmp(arg, "passthrough")) {
2174                         cf->io_mode = CM_IO_PASSTHROUGH;
2175                         mode_ctr++;
2176                 }
2177
2178                 else if (!strcasecmp(arg, "metadata2"))
2179                         cf->metadata_version = 2;
2180
2181                 else if (!strcasecmp(arg, "no_discard_passdown"))
2182                         cf->discard_passdown = false;
2183
2184                 else {
2185                         *error = "Unrecognised cache feature requested";
2186                         return -EINVAL;
2187                 }
2188         }
2189
2190         if (mode_ctr > 1) {
2191                 *error = "Duplicate cache io_mode features requested";
2192                 return -EINVAL;
2193         }
2194
2195         return 0;
2196 }
2197
2198 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2199                         char **error)
2200 {
2201         static const struct dm_arg _args[] = {
2202                 {0, 1024, "Invalid number of policy arguments"},
2203         };
2204
2205         int r;
2206
2207         if (!at_least_one_arg(as, error))
2208                 return -EINVAL;
2209
2210         ca->policy_name = dm_shift_arg(as);
2211
2212         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2213         if (r)
2214                 return -EINVAL;
2215
2216         ca->policy_argv = (const char **)as->argv;
2217         dm_consume_args(as, ca->policy_argc);
2218
2219         return 0;
2220 }
2221
2222 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2223                             char **error)
2224 {
2225         int r;
2226         struct dm_arg_set as;
2227
2228         as.argc = argc;
2229         as.argv = argv;
2230
2231         r = parse_metadata_dev(ca, &as, error);
2232         if (r)
2233                 return r;
2234
2235         r = parse_cache_dev(ca, &as, error);
2236         if (r)
2237                 return r;
2238
2239         r = parse_origin_dev(ca, &as, error);
2240         if (r)
2241                 return r;
2242
2243         r = parse_block_size(ca, &as, error);
2244         if (r)
2245                 return r;
2246
2247         r = parse_features(ca, &as, error);
2248         if (r)
2249                 return r;
2250
2251         r = parse_policy(ca, &as, error);
2252         if (r)
2253                 return r;
2254
2255         return 0;
2256 }
2257
2258 /*----------------------------------------------------------------*/
2259
/* Slab cache that backs each cache's migration_pool mempool. */
static struct kmem_cache *migration_cache;
2261
2262 #define NOT_CORE_OPTION 1
2263
2264 static int process_config_option(struct cache *cache, const char *key, const char *value)
2265 {
2266         unsigned long tmp;
2267
2268         if (!strcasecmp(key, "migration_threshold")) {
2269                 if (kstrtoul(value, 10, &tmp))
2270                         return -EINVAL;
2271
2272                 cache->migration_threshold = tmp;
2273                 return 0;
2274         }
2275
2276         return NOT_CORE_OPTION;
2277 }
2278
2279 static int set_config_value(struct cache *cache, const char *key, const char *value)
2280 {
2281         int r = process_config_option(cache, key, value);
2282
2283         if (r == NOT_CORE_OPTION)
2284                 r = policy_set_config_value(cache->policy, key, value);
2285
2286         if (r)
2287                 DMWARN("bad config value for %s: %s", key, value);
2288
2289         return r;
2290 }
2291
2292 static int set_config_values(struct cache *cache, int argc, const char **argv)
2293 {
2294         int r = 0;
2295
2296         if (argc & 1) {
2297                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2298                 return -EINVAL;
2299         }
2300
2301         while (argc) {
2302                 r = set_config_value(cache, argv[0], argv[1]);
2303                 if (r)
2304                         break;
2305
2306                 argc -= 2;
2307                 argv += 2;
2308         }
2309
2310         return r;
2311 }
2312
2313 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2314                                char **error)
2315 {
2316         struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2317                                                            cache->cache_size,
2318                                                            cache->origin_sectors,
2319                                                            cache->sectors_per_block);
2320         if (IS_ERR(p)) {
2321                 *error = "Error creating cache's policy";
2322                 return PTR_ERR(p);
2323         }
2324         cache->policy = p;
2325         BUG_ON(!cache->policy);
2326
2327         return 0;
2328 }
2329
2330 /*
2331  * We want the discard block size to be at least the size of the cache
2332  * block size and have no more than 2^14 discard blocks across the origin.
2333  */
2334 #define MAX_DISCARD_BLOCKS (1 << 14)
2335
2336 static bool too_many_discard_blocks(sector_t discard_block_size,
2337                                     sector_t origin_size)
2338 {
2339         (void) sector_div(origin_size, discard_block_size);
2340
2341         return origin_size > MAX_DISCARD_BLOCKS;
2342 }
2343
2344 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2345                                              sector_t origin_size)
2346 {
2347         sector_t discard_block_size = cache_block_size;
2348
2349         if (origin_size)
2350                 while (too_many_discard_blocks(discard_block_size, origin_size))
2351                         discard_block_size *= 2;
2352
2353         return discard_block_size;
2354 }
2355
2356 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2357 {
2358         dm_block_t nr_blocks = from_cblock(size);
2359
2360         if (nr_blocks > (1 << 20) && cache->cache_size != size)
2361                 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2362                              "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2363                              "Please consider increasing the cache block size to reduce the overall cache block count.",
2364                              (unsigned long long) nr_blocks);
2365
2366         cache->cache_size = size;
2367 }
2368
2369 #define DEFAULT_MIGRATION_THRESHOLD 2048
2370
2371 static int cache_create(struct cache_args *ca, struct cache **result)
2372 {
2373         int r = 0;
2374         char **error = &ca->ti->error;
2375         struct cache *cache;
2376         struct dm_target *ti = ca->ti;
2377         dm_block_t origin_blocks;
2378         struct dm_cache_metadata *cmd;
2379         bool may_format = ca->features.mode == CM_WRITE;
2380
2381         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2382         if (!cache)
2383                 return -ENOMEM;
2384
2385         cache->ti = ca->ti;
2386         ti->private = cache;
2387         ti->accounts_remapped_io = true;
2388         ti->num_flush_bios = 2;
2389         ti->flush_supported = true;
2390
2391         ti->num_discard_bios = 1;
2392         ti->discards_supported = true;
2393
2394         ti->per_io_data_size = sizeof(struct per_bio_data);
2395
2396         cache->features = ca->features;
2397         if (writethrough_mode(cache)) {
2398                 /* Create bioset for writethrough bios issued to origin */
2399                 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2400                 if (r)
2401                         goto bad;
2402         }
2403
2404         cache->metadata_dev = ca->metadata_dev;
2405         cache->origin_dev = ca->origin_dev;
2406         cache->cache_dev = ca->cache_dev;
2407
2408         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2409
2410         origin_blocks = cache->origin_sectors = ca->origin_sectors;
2411         origin_blocks = block_div(origin_blocks, ca->block_size);
2412         cache->origin_blocks = to_oblock(origin_blocks);
2413
2414         cache->sectors_per_block = ca->block_size;
2415         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2416                 r = -EINVAL;
2417                 goto bad;
2418         }
2419
2420         if (ca->block_size & (ca->block_size - 1)) {
2421                 dm_block_t cache_size = ca->cache_sectors;
2422
2423                 cache->sectors_per_block_shift = -1;
2424                 cache_size = block_div(cache_size, ca->block_size);
2425                 set_cache_size(cache, to_cblock(cache_size));
2426         } else {
2427                 cache->sectors_per_block_shift = __ffs(ca->block_size);
2428                 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2429         }
2430
2431         r = create_cache_policy(cache, ca, error);
2432         if (r)
2433                 goto bad;
2434
2435         cache->policy_nr_args = ca->policy_argc;
2436         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2437
2438         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2439         if (r) {
2440                 *error = "Error setting cache policy's config values";
2441                 goto bad;
2442         }
2443
2444         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2445                                      ca->block_size, may_format,
2446                                      dm_cache_policy_get_hint_size(cache->policy),
2447                                      ca->features.metadata_version);
2448         if (IS_ERR(cmd)) {
2449                 *error = "Error creating metadata object";
2450                 r = PTR_ERR(cmd);
2451                 goto bad;
2452         }
2453         cache->cmd = cmd;
2454         set_cache_mode(cache, CM_WRITE);
2455         if (get_cache_mode(cache) != CM_WRITE) {
2456                 *error = "Unable to get write access to metadata, please check/repair metadata.";
2457                 r = -EINVAL;
2458                 goto bad;
2459         }
2460
2461         if (passthrough_mode(cache)) {
2462                 bool all_clean;
2463
2464                 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2465                 if (r) {
2466                         *error = "dm_cache_metadata_all_clean() failed";
2467                         goto bad;
2468                 }
2469
2470                 if (!all_clean) {
2471                         *error = "Cannot enter passthrough mode unless all blocks are clean";
2472                         r = -EINVAL;
2473                         goto bad;
2474                 }
2475
2476                 policy_allow_migrations(cache->policy, false);
2477         }
2478
2479         spin_lock_init(&cache->lock);
2480         bio_list_init(&cache->deferred_bios);
2481         atomic_set(&cache->nr_allocated_migrations, 0);
2482         atomic_set(&cache->nr_io_migrations, 0);
2483         init_waitqueue_head(&cache->migration_wait);
2484
2485         r = -ENOMEM;
2486         atomic_set(&cache->nr_dirty, 0);
2487         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2488         if (!cache->dirty_bitset) {
2489                 *error = "could not allocate dirty bitset";
2490                 goto bad;
2491         }
2492         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2493
2494         cache->discard_block_size =
2495                 calculate_discard_block_size(cache->sectors_per_block,
2496                                              cache->origin_sectors);
2497         cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2498                                                               cache->discard_block_size));
2499         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2500         if (!cache->discard_bitset) {
2501                 *error = "could not allocate discard bitset";
2502                 goto bad;
2503         }
2504         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2505
2506         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2507         if (IS_ERR(cache->copier)) {
2508                 *error = "could not create kcopyd client";
2509                 r = PTR_ERR(cache->copier);
2510                 goto bad;
2511         }
2512
2513         cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2514         if (!cache->wq) {
2515                 *error = "could not create workqueue for metadata object";
2516                 goto bad;
2517         }
2518         INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2519         INIT_WORK(&cache->migration_worker, check_migrations);
2520         INIT_DELAYED_WORK(&cache->waker, do_waker);
2521
2522         cache->prison = dm_bio_prison_create_v2(cache->wq);
2523         if (!cache->prison) {
2524                 *error = "could not create bio prison";
2525                 goto bad;
2526         }
2527
2528         r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2529                                    migration_cache);
2530         if (r) {
2531                 *error = "Error creating cache's migration mempool";
2532                 goto bad;
2533         }
2534
2535         cache->need_tick_bio = true;
2536         cache->sized = false;
2537         cache->invalidate = false;
2538         cache->commit_requested = false;
2539         cache->loaded_mappings = false;
2540         cache->loaded_discards = false;
2541
2542         load_stats(cache);
2543
2544         atomic_set(&cache->stats.demotion, 0);
2545         atomic_set(&cache->stats.promotion, 0);
2546         atomic_set(&cache->stats.copies_avoided, 0);
2547         atomic_set(&cache->stats.cache_cell_clash, 0);
2548         atomic_set(&cache->stats.commit_count, 0);
2549         atomic_set(&cache->stats.discard_count, 0);
2550
2551         spin_lock_init(&cache->invalidation_lock);
2552         INIT_LIST_HEAD(&cache->invalidation_requests);
2553
2554         batcher_init(&cache->committer, commit_op, cache,
2555                      issue_op, cache, cache->wq);
2556         dm_iot_init(&cache->tracker);
2557
2558         init_rwsem(&cache->background_work_lock);
2559         prevent_background_work(cache);
2560
2561         *result = cache;
2562         return 0;
2563 bad:
2564         destroy(cache);
2565         return r;
2566 }
2567
2568 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2569 {
2570         unsigned int i;
2571         const char **copy;
2572
2573         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2574         if (!copy)
2575                 return -ENOMEM;
2576         for (i = 0; i < argc; i++) {
2577                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2578                 if (!copy[i]) {
2579                         while (i--)
2580                                 kfree(copy[i]);
2581                         kfree(copy);
2582                         return -ENOMEM;
2583                 }
2584         }
2585
2586         cache->nr_ctr_args = argc;
2587         cache->ctr_args = copy;
2588
2589         return 0;
2590 }
2591
2592 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2593 {
2594         int r = -EINVAL;
2595         struct cache_args *ca;
2596         struct cache *cache = NULL;
2597
2598         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2599         if (!ca) {
2600                 ti->error = "Error allocating memory for cache";
2601                 return -ENOMEM;
2602         }
2603         ca->ti = ti;
2604
2605         r = parse_cache_args(ca, argc, argv, &ti->error);
2606         if (r)
2607                 goto out;
2608
2609         r = cache_create(ca, &cache);
2610         if (r)
2611                 goto out;
2612
2613         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2614         if (r) {
2615                 destroy(cache);
2616                 goto out;
2617         }
2618
2619         ti->private = cache;
2620 out:
2621         destroy_cache_args(ca);
2622         return r;
2623 }
2624
2625 /*----------------------------------------------------------------*/
2626
2627 static int cache_map(struct dm_target *ti, struct bio *bio)
2628 {
2629         struct cache *cache = ti->private;
2630
2631         int r;
2632         bool commit_needed;
2633         dm_oblock_t block = get_bio_block(cache, bio);
2634
2635         init_per_bio_data(bio);
2636         if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2637                 /*
2638                  * This can only occur if the io goes to a partial block at
2639                  * the end of the origin device.  We don't cache these.
2640                  * Just remap to the origin and carry on.
2641                  */
2642                 remap_to_origin(cache, bio);
2643                 accounted_begin(cache, bio);
2644                 return DM_MAPIO_REMAPPED;
2645         }
2646
2647         if (discard_or_flush(bio)) {
2648                 defer_bio(cache, bio);
2649                 return DM_MAPIO_SUBMITTED;
2650         }
2651
2652         r = map_bio(cache, bio, block, &commit_needed);
2653         if (commit_needed)
2654                 schedule_commit(&cache->committer);
2655
2656         return r;
2657 }
2658
2659 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2660 {
2661         struct cache *cache = ti->private;
2662         unsigned long flags;
2663         struct per_bio_data *pb = get_per_bio_data(bio);
2664
2665         if (pb->tick) {
2666                 policy_tick(cache->policy, false);
2667
2668                 spin_lock_irqsave(&cache->lock, flags);
2669                 cache->need_tick_bio = true;
2670                 spin_unlock_irqrestore(&cache->lock, flags);
2671         }
2672
2673         bio_drop_shared_lock(cache, bio);
2674         accounted_complete(cache, bio);
2675
2676         return DM_ENDIO_DONE;
2677 }
2678
2679 static int write_dirty_bitset(struct cache *cache)
2680 {
2681         int r;
2682
2683         if (get_cache_mode(cache) >= CM_READ_ONLY)
2684                 return -EINVAL;
2685
2686         r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2687         if (r)
2688                 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2689
2690         return r;
2691 }
2692
2693 static int write_discard_bitset(struct cache *cache)
2694 {
2695         unsigned int i, r;
2696
2697         if (get_cache_mode(cache) >= CM_READ_ONLY)
2698                 return -EINVAL;
2699
2700         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2701                                            cache->discard_nr_blocks);
2702         if (r) {
2703                 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2704                 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2705                 return r;
2706         }
2707
2708         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2709                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2710                                          is_discarded(cache, to_dblock(i)));
2711                 if (r) {
2712                         metadata_operation_failed(cache, "dm_cache_set_discard", r);
2713                         return r;
2714                 }
2715         }
2716
2717         return 0;
2718 }
2719
2720 static int write_hints(struct cache *cache)
2721 {
2722         int r;
2723
2724         if (get_cache_mode(cache) >= CM_READ_ONLY)
2725                 return -EINVAL;
2726
2727         r = dm_cache_write_hints(cache->cmd, cache->policy);
2728         if (r) {
2729                 metadata_operation_failed(cache, "dm_cache_write_hints", r);
2730                 return r;
2731         }
2732
2733         return 0;
2734 }
2735
2736 /*
2737  * returns true on success
2738  */
2739 static bool sync_metadata(struct cache *cache)
2740 {
2741         int r1, r2, r3, r4;
2742
2743         r1 = write_dirty_bitset(cache);
2744         if (r1)
2745                 DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2746
2747         r2 = write_discard_bitset(cache);
2748         if (r2)
2749                 DMERR("%s: could not write discard bitset", cache_device_name(cache));
2750
2751         save_stats(cache);
2752
2753         r3 = write_hints(cache);
2754         if (r3)
2755                 DMERR("%s: could not write hints", cache_device_name(cache));
2756
2757         /*
2758          * If writing the above metadata failed, we still commit, but don't
2759          * set the clean shutdown flag.  This will effectively force every
2760          * dirty bit to be set on reload.
2761          */
2762         r4 = commit(cache, !r1 && !r2 && !r3);
2763         if (r4)
2764                 DMERR("%s: could not write cache metadata", cache_device_name(cache));
2765
2766         return !r1 && !r2 && !r3 && !r4;
2767 }
2768
2769 static void cache_postsuspend(struct dm_target *ti)
2770 {
2771         struct cache *cache = ti->private;
2772
2773         prevent_background_work(cache);
2774         BUG_ON(atomic_read(&cache->nr_io_migrations));
2775
2776         cancel_delayed_work_sync(&cache->waker);
2777         drain_workqueue(cache->wq);
2778         WARN_ON(cache->tracker.in_flight);
2779
2780         /*
2781          * If it's a flush suspend there won't be any deferred bios, so this
2782          * call is harmless.
2783          */
2784         requeue_deferred_bios(cache);
2785
2786         if (get_cache_mode(cache) == CM_WRITE)
2787                 (void) sync_metadata(cache);
2788 }
2789
2790 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2791                         bool dirty, uint32_t hint, bool hint_valid)
2792 {
2793         struct cache *cache = context;
2794
2795         if (dirty) {
2796                 set_bit(from_cblock(cblock), cache->dirty_bitset);
2797                 atomic_inc(&cache->nr_dirty);
2798         } else
2799                 clear_bit(from_cblock(cblock), cache->dirty_bitset);
2800
2801         return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2802 }
2803
2804 /*
2805  * The discard block size in the on disk metadata is not
2806  * necessarily the same as we're currently using.  So we have to
2807  * be careful to only set the discarded attribute if we know it
2808  * covers a complete block of the new size.
2809  */
2810 struct discard_load_info {
2811         struct cache *cache;
2812
2813         /*
2814          * These blocks are sized using the on disk dblock size, rather
2815          * than the current one.
2816          */
2817         dm_block_t block_size;
2818         dm_block_t discard_begin, discard_end;
2819 };
2820
2821 static void discard_load_info_init(struct cache *cache,
2822                                    struct discard_load_info *li)
2823 {
2824         li->cache = cache;
2825         li->discard_begin = li->discard_end = 0;
2826 }
2827
2828 static void set_discard_range(struct discard_load_info *li)
2829 {
2830         sector_t b, e;
2831
2832         if (li->discard_begin == li->discard_end)
2833                 return;
2834
2835         /*
2836          * Convert to sectors.
2837          */
2838         b = li->discard_begin * li->block_size;
2839         e = li->discard_end * li->block_size;
2840
2841         /*
2842          * Then convert back to the current dblock size.
2843          */
2844         b = dm_sector_div_up(b, li->cache->discard_block_size);
2845         sector_div(e, li->cache->discard_block_size);
2846
2847         /*
2848          * The origin may have shrunk, so we need to check we're still in
2849          * bounds.
2850          */
2851         if (e > from_dblock(li->cache->discard_nr_blocks))
2852                 e = from_dblock(li->cache->discard_nr_blocks);
2853
2854         for (; b < e; b++)
2855                 set_discard(li->cache, to_dblock(b));
2856 }
2857
2858 static int load_discard(void *context, sector_t discard_block_size,
2859                         dm_dblock_t dblock, bool discard)
2860 {
2861         struct discard_load_info *li = context;
2862
2863         li->block_size = discard_block_size;
2864
2865         if (discard) {
2866                 if (from_dblock(dblock) == li->discard_end)
2867                         /*
2868                          * We're already in a discard range, just extend it.
2869                          */
2870                         li->discard_end = li->discard_end + 1ULL;
2871
2872                 else {
2873                         /*
2874                          * Emit the old range and start a new one.
2875                          */
2876                         set_discard_range(li);
2877                         li->discard_begin = from_dblock(dblock);
2878                         li->discard_end = li->discard_begin + 1ULL;
2879                 }
2880         } else {
2881                 set_discard_range(li);
2882                 li->discard_begin = li->discard_end = 0;
2883         }
2884
2885         return 0;
2886 }
2887
2888 static dm_cblock_t get_cache_dev_size(struct cache *cache)
2889 {
2890         sector_t size = get_dev_size(cache->cache_dev);
2891         (void) sector_div(size, cache->sectors_per_block);
2892         return to_cblock(size);
2893 }
2894
2895 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2896 {
2897         if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
2898                 if (cache->sized) {
2899                         DMERR("%s: unable to extend cache due to missing cache table reload",
2900                               cache_device_name(cache));
2901                         return false;
2902                 }
2903         }
2904
2905         /*
2906          * We can't drop a dirty block when shrinking the cache.
2907          */
2908         while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2909                 new_size = to_cblock(from_cblock(new_size) + 1);
2910                 if (is_dirty(cache, new_size)) {
2911                         DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2912                               cache_device_name(cache),
2913                               (unsigned long long) from_cblock(new_size));
2914                         return false;
2915                 }
2916         }
2917
2918         return true;
2919 }
2920
2921 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2922 {
2923         int r;
2924
2925         r = dm_cache_resize(cache->cmd, new_size);
2926         if (r) {
2927                 DMERR("%s: could not resize cache metadata", cache_device_name(cache));
2928                 metadata_operation_failed(cache, "dm_cache_resize", r);
2929                 return r;
2930         }
2931
2932         set_cache_size(cache, new_size);
2933
2934         return 0;
2935 }
2936
2937 static int cache_preresume(struct dm_target *ti)
2938 {
2939         int r = 0;
2940         struct cache *cache = ti->private;
2941         dm_cblock_t csize = get_cache_dev_size(cache);
2942
2943         /*
2944          * Check to see if the cache has resized.
2945          */
2946         if (!cache->sized) {
2947                 r = resize_cache_dev(cache, csize);
2948                 if (r)
2949                         return r;
2950
2951                 cache->sized = true;
2952
2953         } else if (csize != cache->cache_size) {
2954                 if (!can_resize(cache, csize))
2955                         return -EINVAL;
2956
2957                 r = resize_cache_dev(cache, csize);
2958                 if (r)
2959                         return r;
2960         }
2961
2962         if (!cache->loaded_mappings) {
2963                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2964                                            load_mapping, cache);
2965                 if (r) {
2966                         DMERR("%s: could not load cache mappings", cache_device_name(cache));
2967                         metadata_operation_failed(cache, "dm_cache_load_mappings", r);
2968                         return r;
2969                 }
2970
2971                 cache->loaded_mappings = true;
2972         }
2973
2974         if (!cache->loaded_discards) {
2975                 struct discard_load_info li;
2976
2977                 /*
2978                  * The discard bitset could have been resized, or the
2979                  * discard block size changed.  To be safe we start by
2980                  * setting every dblock to not discarded.
2981                  */
2982                 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2983
2984                 discard_load_info_init(cache, &li);
2985                 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
2986                 if (r) {
2987                         DMERR("%s: could not load origin discards", cache_device_name(cache));
2988                         metadata_operation_failed(cache, "dm_cache_load_discards", r);
2989                         return r;
2990                 }
2991                 set_discard_range(&li);
2992
2993                 cache->loaded_discards = true;
2994         }
2995
2996         return r;
2997 }
2998
2999 static void cache_resume(struct dm_target *ti)
3000 {
3001         struct cache *cache = ti->private;
3002
3003         cache->need_tick_bio = true;
3004         allow_background_work(cache);
3005         do_waker(&cache->waker.work);
3006 }
3007
3008 static void emit_flags(struct cache *cache, char *result,
3009                        unsigned int maxlen, ssize_t *sz_ptr)
3010 {
3011         ssize_t sz = *sz_ptr;
3012         struct cache_features *cf = &cache->features;
3013         unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3014
3015         DMEMIT("%u ", count);
3016
3017         if (cf->metadata_version == 2)
3018                 DMEMIT("metadata2 ");
3019
3020         if (writethrough_mode(cache))
3021                 DMEMIT("writethrough ");
3022
3023         else if (passthrough_mode(cache))
3024                 DMEMIT("passthrough ");
3025
3026         else if (writeback_mode(cache))
3027                 DMEMIT("writeback ");
3028
3029         else {
3030                 DMEMIT("unknown ");
3031                 DMERR("%s: internal error: unknown io mode: %d",
3032                       cache_device_name(cache), (int) cf->io_mode);
3033         }
3034
3035         if (!cf->discard_passdown)
3036                 DMEMIT("no_discard_passdown ");
3037
3038         *sz_ptr = sz;
3039 }
3040
3041 /*
3042  * Status format:
3043  *
3044  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3045  * <cache block size> <#used cache blocks>/<#total cache blocks>
3046  * <#read hits> <#read misses> <#write hits> <#write misses>
3047  * <#demotions> <#promotions> <#dirty>
3048  * <#features> <features>*
3049  * <#core args> <core args>
3050  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3051  */
3052 static void cache_status(struct dm_target *ti, status_type_t type,
3053                          unsigned int status_flags, char *result, unsigned int maxlen)
3054 {
3055         int r = 0;
3056         unsigned int i;
3057         ssize_t sz = 0;
3058         dm_block_t nr_free_blocks_metadata = 0;
3059         dm_block_t nr_blocks_metadata = 0;
3060         char buf[BDEVNAME_SIZE];
3061         struct cache *cache = ti->private;
3062         dm_cblock_t residency;
3063         bool needs_check;
3064
3065         switch (type) {
3066         case STATUSTYPE_INFO:
3067                 if (get_cache_mode(cache) == CM_FAIL) {
3068                         DMEMIT("Fail");
3069                         break;
3070                 }
3071
3072                 /* Commit to ensure statistics aren't out-of-date */
3073                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3074                         (void) commit(cache, false);
3075
3076                 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3077                 if (r) {
3078                         DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3079                               cache_device_name(cache), r);
3080                         goto err;
3081                 }
3082
3083                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3084                 if (r) {
3085                         DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3086                               cache_device_name(cache), r);
3087                         goto err;
3088                 }
3089
3090                 residency = policy_residency(cache->policy);
3091
3092                 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3093                        (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE,
3094                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3095                        (unsigned long long)nr_blocks_metadata,
3096                        (unsigned long long)cache->sectors_per_block,
3097                        (unsigned long long) from_cblock(residency),
3098                        (unsigned long long) from_cblock(cache->cache_size),
3099                        (unsigned int) atomic_read(&cache->stats.read_hit),
3100                        (unsigned int) atomic_read(&cache->stats.read_miss),
3101                        (unsigned int) atomic_read(&cache->stats.write_hit),
3102                        (unsigned int) atomic_read(&cache->stats.write_miss),
3103                        (unsigned int) atomic_read(&cache->stats.demotion),
3104                        (unsigned int) atomic_read(&cache->stats.promotion),
3105                        (unsigned long) atomic_read(&cache->nr_dirty));
3106
3107                 emit_flags(cache, result, maxlen, &sz);
3108
3109                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3110
3111                 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3112                 if (sz < maxlen) {
3113                         r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3114                         if (r)
3115                                 DMERR("%s: policy_emit_config_values returned %d",
3116                                       cache_device_name(cache), r);
3117                 }
3118
3119                 if (get_cache_mode(cache) == CM_READ_ONLY)
3120                         DMEMIT("ro ");
3121                 else
3122                         DMEMIT("rw ");
3123
3124                 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3125
3126                 if (r || needs_check)
3127                         DMEMIT("needs_check ");
3128                 else
3129                         DMEMIT("- ");
3130
3131                 break;
3132
3133         case STATUSTYPE_TABLE:
3134                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3135                 DMEMIT("%s ", buf);
3136                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3137                 DMEMIT("%s ", buf);
3138                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3139                 DMEMIT("%s", buf);
3140
3141                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3142                         DMEMIT(" %s", cache->ctr_args[i]);
3143                 if (cache->nr_ctr_args)
3144                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3145                 break;
3146
3147         case STATUSTYPE_IMA:
3148                 DMEMIT_TARGET_NAME_VERSION(ti->type);
3149                 if (get_cache_mode(cache) == CM_FAIL)
3150                         DMEMIT(",metadata_mode=fail");
3151                 else if (get_cache_mode(cache) == CM_READ_ONLY)
3152                         DMEMIT(",metadata_mode=ro");
3153                 else
3154                         DMEMIT(",metadata_mode=rw");
3155
3156                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3157                 DMEMIT(",cache_metadata_device=%s", buf);
3158                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3159                 DMEMIT(",cache_device=%s", buf);
3160                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3161                 DMEMIT(",cache_origin_device=%s", buf);
3162                 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n');
3163                 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n');
3164                 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n');
3165                 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n');
3166                 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y');
3167                 DMEMIT(";");
3168                 break;
3169         }
3170
3171         return;
3172
3173 err:
3174         DMEMIT("Error");
3175 }
3176
/*
 * Defines a half-open range of cblocks: begin to (end - 1) are in the
 * range, end is the one-past-the-end value.  Filled in by
 * parse_cblock_range() and checked by validate_cblock_range().
 */
struct cblock_range {
	dm_cblock_t begin;	/* first cblock in the range (inclusive) */
	dm_cblock_t end;	/* one past the last cblock (exclusive) */
};
3185
3186 /*
3187  * A cache block range can take two forms:
3188  *
3189  * i) A single cblock, eg. '3456'
3190  * ii) A begin and end cblock with a dash between, eg. 123-234
3191  */
3192 static int parse_cblock_range(struct cache *cache, const char *str,
3193                               struct cblock_range *result)
3194 {
3195         char dummy;
3196         uint64_t b, e;
3197         int r;
3198
3199         /*
3200          * Try and parse form (ii) first.
3201          */
3202         r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3203         if (r < 0)
3204                 return r;
3205
3206         if (r == 2) {
3207                 result->begin = to_cblock(b);
3208                 result->end = to_cblock(e);
3209                 return 0;
3210         }
3211
3212         /*
3213          * That didn't work, try form (i).
3214          */
3215         r = sscanf(str, "%llu%c", &b, &dummy);
3216         if (r < 0)
3217                 return r;
3218
3219         if (r == 1) {
3220                 result->begin = to_cblock(b);
3221                 result->end = to_cblock(from_cblock(result->begin) + 1u);
3222                 return 0;
3223         }
3224
3225         DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3226         return -EINVAL;
3227 }
3228
3229 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3230 {
3231         uint64_t b = from_cblock(range->begin);
3232         uint64_t e = from_cblock(range->end);
3233         uint64_t n = from_cblock(cache->cache_size);
3234
3235         if (b >= n) {
3236                 DMERR("%s: begin cblock out of range: %llu >= %llu",
3237                       cache_device_name(cache), b, n);
3238                 return -EINVAL;
3239         }
3240
3241         if (e > n) {
3242                 DMERR("%s: end cblock out of range: %llu > %llu",
3243                       cache_device_name(cache), e, n);
3244                 return -EINVAL;
3245         }
3246
3247         if (b >= e) {
3248                 DMERR("%s: invalid cblock range: %llu >= %llu",
3249                       cache_device_name(cache), b, e);
3250                 return -EINVAL;
3251         }
3252
3253         return 0;
3254 }
3255
3256 static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3257 {
3258         return to_cblock(from_cblock(b) + 1);
3259 }
3260
3261 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3262 {
3263         int r = 0;
3264
3265         /*
3266          * We don't need to do any locking here because we know we're in
3267          * passthrough mode.  There's is potential for a race between an
3268          * invalidation triggered by an io and an invalidation message.  This
3269          * is harmless, we must not worry if the policy call fails.
3270          */
3271         while (range->begin != range->end) {
3272                 r = invalidate_cblock(cache, range->begin);
3273                 if (r)
3274                         return r;
3275
3276                 range->begin = cblock_succ(range->begin);
3277         }
3278
3279         cache->commit_requested = true;
3280         return r;
3281 }
3282
3283 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count,
3284                                               const char **cblock_ranges)
3285 {
3286         int r = 0;
3287         unsigned int i;
3288         struct cblock_range range;
3289
3290         if (!passthrough_mode(cache)) {
3291                 DMERR("%s: cache has to be in passthrough mode for invalidation",
3292                       cache_device_name(cache));
3293                 return -EPERM;
3294         }
3295
3296         for (i = 0; i < count; i++) {
3297                 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3298                 if (r)
3299                         break;
3300
3301                 r = validate_cblock_range(cache, &range);
3302                 if (r)
3303                         break;
3304
3305                 /*
3306                  * Pass begin and end origin blocks to the worker and wake it.
3307                  */
3308                 r = request_invalidation(cache, &range);
3309                 if (r)
3310                         break;
3311         }
3312
3313         return r;
3314 }
3315
3316 /*
3317  * Supports
3318  *      "<key> <value>"
3319  * and
3320  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
3321  *
3322  * The key migration_threshold is supported by the cache target core.
3323  */
3324 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv,
3325                          char *result, unsigned int maxlen)
3326 {
3327         struct cache *cache = ti->private;
3328
3329         if (!argc)
3330                 return -EINVAL;
3331
3332         if (get_cache_mode(cache) >= CM_READ_ONLY) {
3333                 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3334                       cache_device_name(cache));
3335                 return -EOPNOTSUPP;
3336         }
3337
3338         if (!strcasecmp(argv[0], "invalidate_cblocks"))
3339                 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3340
3341         if (argc != 2)
3342                 return -EINVAL;
3343
3344         return set_config_value(cache, argv[0], argv[1]);
3345 }
3346
3347 static int cache_iterate_devices(struct dm_target *ti,
3348                                  iterate_devices_callout_fn fn, void *data)
3349 {
3350         int r = 0;
3351         struct cache *cache = ti->private;
3352
3353         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3354         if (!r)
3355                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3356
3357         return r;
3358 }
3359
3360 /*
3361  * If discard_passdown was enabled verify that the origin device
3362  * supports discards.  Disable discard_passdown if not.
3363  */
3364 static void disable_passdown_if_not_supported(struct cache *cache)
3365 {
3366         struct block_device *origin_bdev = cache->origin_dev->bdev;
3367         struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3368         const char *reason = NULL;
3369
3370         if (!cache->features.discard_passdown)
3371                 return;
3372
3373         if (!bdev_max_discard_sectors(origin_bdev))
3374                 reason = "discard unsupported";
3375
3376         else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3377                 reason = "max discard sectors smaller than a block";
3378
3379         if (reason) {
3380                 DMWARN("Origin device (%pg) %s: Disabling discard passdown.",
3381                        origin_bdev, reason);
3382                 cache->features.discard_passdown = false;
3383         }
3384 }
3385
3386 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3387 {
3388         struct block_device *origin_bdev = cache->origin_dev->bdev;
3389         struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3390
3391         if (!cache->features.discard_passdown) {
3392                 /* No passdown is done so setting own virtual limits */
3393                 limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3394                                                        cache->origin_sectors);
3395                 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3396                 return;
3397         }
3398
3399         /*
3400          * cache_iterate_devices() is stacking both origin and fast device limits
3401          * but discards aren't passed to fast device, so inherit origin's limits.
3402          */
3403         limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3404         limits->discard_granularity = origin_limits->discard_granularity;
3405         limits->discard_alignment = origin_limits->discard_alignment;
3406 }
3407
3408 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3409 {
3410         struct cache *cache = ti->private;
3411         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3412
3413         /*
3414          * If the system-determined stacked limits are compatible with the
3415          * cache's blocksize (io_opt is a factor) do not override them.
3416          */
3417         if (io_opt_sectors < cache->sectors_per_block ||
3418             do_div(io_opt_sectors, cache->sectors_per_block)) {
3419                 limits->io_min = cache->sectors_per_block << SECTOR_SHIFT;
3420                 limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT;
3421         }
3422
3423         disable_passdown_if_not_supported(cache);
3424         set_discard_limits(cache, limits);
3425 }
3426
3427 /*----------------------------------------------------------------*/
3428
3429 static struct target_type cache_target = {
3430         .name = "cache",
3431         .version = {2, 2, 0},
3432         .module = THIS_MODULE,
3433         .ctr = cache_ctr,
3434         .dtr = cache_dtr,
3435         .map = cache_map,
3436         .end_io = cache_end_io,
3437         .postsuspend = cache_postsuspend,
3438         .preresume = cache_preresume,
3439         .resume = cache_resume,
3440         .status = cache_status,
3441         .message = cache_message,
3442         .iterate_devices = cache_iterate_devices,
3443         .io_hints = cache_io_hints,
3444 };
3445
3446 static int __init dm_cache_init(void)
3447 {
3448         int r;
3449
3450         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3451         if (!migration_cache)
3452                 return -ENOMEM;
3453
3454         r = dm_register_target(&cache_target);
3455         if (r) {
3456                 kmem_cache_destroy(migration_cache);
3457                 return r;
3458         }
3459
3460         return 0;
3461 }
3462
/*
 * Module exit: undo dm_cache_init() in reverse order, unregistering the
 * target type before destroying the migration slab cache.
 */
static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}
3468
/* Module entry and exit points, followed by the module metadata. */
module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <[email protected]>");
MODULE_LICENSE("GPL");