drivers/md/dm-thin.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2011-2012 Red Hat UK.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include "dm-thin-metadata.h"
9 #include "dm-bio-prison-v1.h"
10 #include "dm.h"
11
12 #include <linux/device-mapper.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/jiffies.h>
16 #include <linux/log2.h>
17 #include <linux/list.h>
18 #include <linux/rculist.h>
19 #include <linux/init.h>
20 #include <linux/module.h>
21 #include <linux/slab.h>
22 #include <linux/vmalloc.h>
23 #include <linux/sort.h>
24 #include <linux/rbtree.h>
25
26 #define DM_MSG_PREFIX   "thin"
27
28 /*
29  * Tunable constants
30  */
31 #define ENDIO_HOOK_POOL_SIZE 1024
32 #define MAPPING_POOL_SIZE 1024
33 #define COMMIT_PERIOD HZ
34 #define NO_SPACE_TIMEOUT_SECS 60
35
36 static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
37
38 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
39                 "A percentage of time allocated for copy on write");
40
41 /*
42  * The block size of the device holding pool data must be
43  * between 64KB and 1GB.
44  */
45 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
46 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
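
/*
 * For example, with 512-byte sectors (SECTOR_SHIFT == 9) these limits work
 * out to 128 sectors for the 64KB minimum and 2097152 sectors for the 1GB
 * maximum.
 */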
47
48 /*
49  * Device id is restricted to 24 bits.
50  */
51 #define MAX_DEV_ID ((1 << 24) - 1)
52
53 /*
54  * How do we handle breaking sharing of data blocks?
55  * =================================================
56  *
57  * We use a standard copy-on-write btree to store the mappings for the
58  * devices (note I'm talking about copy-on-write of the metadata here, not
59  * the data).  When you take an internal snapshot you clone the root node
60  * of the origin btree.  After this there is no concept of an origin or a
61  * snapshot.  They are just two device trees that happen to point to the
62  * same data blocks.
63  *
64  * When we get a write in we decide if it's to a shared data block using
65  * some timestamp magic.  If it is, we have to break sharing.
66  *
67  * Let's say we write to a shared block in what was the origin.  The
68  * steps are:
69  *
70  * i) plug further io to this physical block (see bio_prison code).
71  *
72  * ii) quiesce any read io to that shared data block, from all devices
73  * that share this block (see dm_deferred_set code).
74  *
75  * iii) copy the data block to a newly allocated block.  This step can be
76  * skipped if the io covers the whole block (schedule_copy).
77  *
78  * iv) insert the new mapping into the origin's btree
79  * (process_prepared_mapping).  This act of inserting breaks some
80  * sharing of btree nodes between the two devices.  Breaking sharing only
81  * affects the btree of that specific device.  Btrees for the other
82  * devices that share the block never change.  The btree for the origin
83  * device as it was after the last commit is untouched, i.e. we're using
84  * persistent data structures in the functional programming sense.
85  *
86  * v) unplug io to this physical block, including the io that triggered
87  * the breaking of sharing.
88  *
89  * Steps (ii) and (iii) occur in parallel.
90  *
91  * The metadata _doesn't_ need to be committed before the io continues.  We
92  * get away with this because the io is always written to a _new_ block.
93  * If there's a crash, then:
94  *
95  * - The origin mapping will point to the old origin block (the shared
96  * one).  This will contain the data as it was before the io that triggered
97  * the breaking of sharing came in.
98  *
99  * - The snap mapping still points to the old block, just as it would after
100  * the commit.
101  *
102  * The downside of this scheme is that the timestamp magic isn't perfect, and
103  * will continue to think the data block in the snapshot device is shared
104  * even after the write to the origin has broken sharing.  I suspect data
105  * blocks will typically be shared by many different devices, so we're
106  * breaking sharing n + 1 times, rather than n, where n is the number of
107  * devices that reference this data block.  At the moment I think the
108  * benefits far, far outweigh the disadvantages.
109  */
110
111 /*----------------------------------------------------------------*/
112
113 /*
114  * Key building.
115  */
116 enum lock_space {
117         VIRTUAL,
118         PHYSICAL
119 };
120
121 static bool build_key(struct dm_thin_device *td, enum lock_space ls,
122                       dm_block_t b, dm_block_t e, struct dm_cell_key *key)
123 {
124         key->virtual = (ls == VIRTUAL);
125         key->dev = dm_thin_dev_id(td);
126         key->block_begin = b;
127         key->block_end = e;
128
129         return dm_cell_key_has_valid_range(key);
130 }
131
132 static void build_data_key(struct dm_thin_device *td, dm_block_t b,
133                            struct dm_cell_key *key)
134 {
135         (void) build_key(td, PHYSICAL, b, b + 1llu, key);
136 }
137
138 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
139                               struct dm_cell_key *key)
140 {
141         (void) build_key(td, VIRTUAL, b, b + 1llu, key);
142 }
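
/*
 * The two helpers above build single-block keys; a one-block range is always
 * a valid range, which is why the return value of build_key() is ignored
 * here.
 */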
143
144 /*----------------------------------------------------------------*/
145
146 #define THROTTLE_THRESHOLD (1 * HZ)
147
148 struct throttle {
149         struct rw_semaphore lock;
150         unsigned long threshold;
151         bool throttle_applied;
152 };
153
154 static void throttle_init(struct throttle *t)
155 {
156         init_rwsem(&t->lock);
157         t->throttle_applied = false;
158 }
159
160 static void throttle_work_start(struct throttle *t)
161 {
162         t->threshold = jiffies + THROTTLE_THRESHOLD;
163 }
164
165 static void throttle_work_update(struct throttle *t)
166 {
167         if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
168                 down_write(&t->lock);
169                 t->throttle_applied = true;
170         }
171 }
172
173 static void throttle_work_complete(struct throttle *t)
174 {
175         if (t->throttle_applied) {
176                 t->throttle_applied = false;
177                 up_write(&t->lock);
178         }
179 }
180
181 static void throttle_lock(struct throttle *t)
182 {
183         down_read(&t->lock);
184 }
185
186 static void throttle_unlock(struct throttle *t)
187 {
188         up_read(&t->lock);
189 }
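
/*
 * To summarise the throttle: bio submitters take the read lock via
 * throttle_lock()/throttle_unlock().  The worker calls throttle_work_start()
 * when it begins a pass; if the pass is still running after
 * THROTTLE_THRESHOLD jiffies, throttle_work_update() takes the write lock,
 * holding off new submitters until throttle_work_complete() releases it.
 */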
190
191 /*----------------------------------------------------------------*/
192
193 /*
194  * A pool device ties together a metadata device and a data device.  It
195  * also provides the interface for creating and destroying internal
196  * devices.
197  */
198 struct dm_thin_new_mapping;
199
200 /*
201  * The pool runs in various modes, ordered from least to most degraded so they can be compared.
202  */
203 enum pool_mode {
204         PM_WRITE,               /* metadata may be changed */
205         PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
206
207         /*
208          * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
209          */
210         PM_OUT_OF_METADATA_SPACE,
211         PM_READ_ONLY,           /* metadata may not be changed */
212
213         PM_FAIL,                /* all I/O fails */
214 };
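
/*
 * Because the modes above are ordered from least to most degraded they can
 * be compared numerically, e.g. commit() below bails out once
 * get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE.
 */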
215
216 struct pool_features {
217         enum pool_mode mode;
218
219         bool zero_new_blocks:1;
220         bool discard_enabled:1;
221         bool discard_passdown:1;
222         bool error_if_no_space:1;
223 };
224
225 struct thin_c;
226 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
227 typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
228 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
229
230 #define CELL_SORT_ARRAY_SIZE 8192
231
232 struct pool {
233         struct list_head list;
234         struct dm_target *ti;   /* Only set if a pool target is bound */
235
236         struct mapped_device *pool_md;
237         struct block_device *data_dev;
238         struct block_device *md_dev;
239         struct dm_pool_metadata *pmd;
240
241         dm_block_t low_water_blocks;
242         uint32_t sectors_per_block;
243         int sectors_per_block_shift;
244
245         struct pool_features pf;
246         bool low_water_triggered:1;     /* A dm event has been sent */
247         bool suspended:1;
248         bool out_of_data_space:1;
249
250         struct dm_bio_prison *prison;
251         struct dm_kcopyd_client *copier;
252
253         struct work_struct worker;
254         struct workqueue_struct *wq;
255         struct throttle throttle;
256         struct delayed_work waker;
257         struct delayed_work no_space_timeout;
258
259         unsigned long last_commit_jiffies;
260         unsigned int ref_count;
261
262         spinlock_t lock;
263         struct bio_list deferred_flush_bios;
264         struct bio_list deferred_flush_completions;
265         struct list_head prepared_mappings;
266         struct list_head prepared_discards;
267         struct list_head prepared_discards_pt2;
268         struct list_head active_thins;
269
270         struct dm_deferred_set *shared_read_ds;
271         struct dm_deferred_set *all_io_ds;
272
273         struct dm_thin_new_mapping *next_mapping;
274
275         process_bio_fn process_bio;
276         process_bio_fn process_discard;
277
278         process_cell_fn process_cell;
279         process_cell_fn process_discard_cell;
280
281         process_mapping_fn process_prepared_mapping;
282         process_mapping_fn process_prepared_discard;
283         process_mapping_fn process_prepared_discard_pt2;
284
285         struct dm_bio_prison_cell **cell_sort_array;
286
287         mempool_t mapping_pool;
288 };
289
290 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
291
292 static enum pool_mode get_pool_mode(struct pool *pool)
293 {
294         return pool->pf.mode;
295 }
296
297 static void notify_of_pool_mode_change(struct pool *pool)
298 {
299         static const char *descs[] = {
300                 "write",
301                 "out-of-data-space",
302                 "read-only",
303                 "read-only",
304                 "fail"
305         };
306         const char *extra_desc = NULL;
307         enum pool_mode mode = get_pool_mode(pool);
308
309         if (mode == PM_OUT_OF_DATA_SPACE) {
310                 if (!pool->pf.error_if_no_space)
311                         extra_desc = " (queue IO)";
312                 else
313                         extra_desc = " (error IO)";
314         }
315
316         dm_table_event(pool->ti->table);
317         DMINFO("%s: switching pool to %s%s mode",
318                dm_device_name(pool->pool_md),
319                descs[(int)mode], extra_desc ? : "");
320 }
321
322 /*
323  * Target context for a pool.
324  */
325 struct pool_c {
326         struct dm_target *ti;
327         struct pool *pool;
328         struct dm_dev *data_dev;
329         struct dm_dev *metadata_dev;
330
331         dm_block_t low_water_blocks;
332         struct pool_features requested_pf; /* Features requested during table load */
333         struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
334 };
335
336 /*
337  * Target context for a thin.
338  */
339 struct thin_c {
340         struct list_head list;
341         struct dm_dev *pool_dev;
342         struct dm_dev *origin_dev;
343         sector_t origin_size;
344         dm_thin_id dev_id;
345
346         struct pool *pool;
347         struct dm_thin_device *td;
348         struct mapped_device *thin_md;
349
350         bool requeue_mode:1;
351         spinlock_t lock;
352         struct list_head deferred_cells;
353         struct bio_list deferred_bio_list;
354         struct bio_list retry_on_resume_list;
355         struct rb_root sort_bio_list; /* sorted list of deferred bios */
356
357         /*
358          * Ensures the thin is not destroyed until the worker has finished
359          * iterating the active_thins list.
360          */
361         refcount_t refcount;
362         struct completion can_destroy;
363 };
364
365 /*----------------------------------------------------------------*/
366
367 static bool block_size_is_power_of_two(struct pool *pool)
368 {
369         return pool->sectors_per_block_shift >= 0;
370 }
371
372 static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
373 {
374         return block_size_is_power_of_two(pool) ?
375                 (b << pool->sectors_per_block_shift) :
376                 (b * pool->sectors_per_block);
377 }
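
/*
 * For example, with a 1MB block size (2048 sectors, so
 * sectors_per_block_shift == 11) block 5 starts at sector 5 << 11 == 10240.
 */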
378
379 /*----------------------------------------------------------------*/
380
381 struct discard_op {
382         struct thin_c *tc;
383         struct blk_plug plug;
384         struct bio *parent_bio;
385         struct bio *bio;
386 };
387
388 static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
389 {
390         BUG_ON(!parent);
391
392         op->tc = tc;
393         blk_start_plug(&op->plug);
394         op->parent_bio = parent;
395         op->bio = NULL;
396 }
397
398 static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
399 {
400         struct thin_c *tc = op->tc;
401         sector_t s = block_to_sectors(tc->pool, data_b);
402         sector_t len = block_to_sectors(tc->pool, data_e - data_b);
403
404         return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
405 }
406
407 static void end_discard(struct discard_op *op, int r)
408 {
409         if (op->bio) {
410                 /*
411                  * Even if one of the calls to issue_discard failed, we
412                  * need to wait for the chain to complete.
413                  */
414                 bio_chain(op->bio, op->parent_bio);
415                 op->bio->bi_opf = REQ_OP_DISCARD;
416                 submit_bio(op->bio);
417         }
418
419         blk_finish_plug(&op->plug);
420
421         /*
422          * Even if r is set, there could be sub discards in flight that we
423          * need to wait for.
424          */
425         if (r && !op->parent_bio->bi_status)
426                 op->parent_bio->bi_status = errno_to_blk_status(r);
427         bio_endio(op->parent_bio);
428 }
429
430 /*----------------------------------------------------------------*/
431
432 /*
433  * wake_worker() is used when new work is queued and when pool_resume is
434  * ready to continue deferred IO processing.
435  */
436 static void wake_worker(struct pool *pool)
437 {
438         queue_work(pool->wq, &pool->worker);
439 }
440
441 /*----------------------------------------------------------------*/
442
443 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
444                       struct dm_bio_prison_cell **cell_result)
445 {
446         int r;
447         struct dm_bio_prison_cell *cell_prealloc;
448
449         /*
450          * Allocate a cell from the prison's mempool.
451          * This might block but it can't fail.
452          */
453         cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
454
455         r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
456         if (r) {
457                 /*
458                  * We reused an old cell; we can get rid of
459                  * the new one.
460                  */
461                 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
462         }
463
464         return r;
465 }
466
467 static void cell_release(struct pool *pool,
468                          struct dm_bio_prison_cell *cell,
469                          struct bio_list *bios)
470 {
471         dm_cell_release(pool->prison, cell, bios);
472         dm_bio_prison_free_cell(pool->prison, cell);
473 }
474
475 static void cell_visit_release(struct pool *pool,
476                                void (*fn)(void *, struct dm_bio_prison_cell *),
477                                void *context,
478                                struct dm_bio_prison_cell *cell)
479 {
480         dm_cell_visit_release(pool->prison, fn, context, cell);
481         dm_bio_prison_free_cell(pool->prison, cell);
482 }
483
484 static void cell_release_no_holder(struct pool *pool,
485                                    struct dm_bio_prison_cell *cell,
486                                    struct bio_list *bios)
487 {
488         dm_cell_release_no_holder(pool->prison, cell, bios);
489         dm_bio_prison_free_cell(pool->prison, cell);
490 }
491
492 static void cell_error_with_code(struct pool *pool,
493                 struct dm_bio_prison_cell *cell, blk_status_t error_code)
494 {
495         dm_cell_error(pool->prison, cell, error_code);
496         dm_bio_prison_free_cell(pool->prison, cell);
497 }
498
499 static blk_status_t get_pool_io_error_code(struct pool *pool)
500 {
501         return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
502 }
503
504 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
505 {
506         cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
507 }
508
509 static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
510 {
511         cell_error_with_code(pool, cell, 0);
512 }
513
514 static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
515 {
516         cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
517 }
518
519 /*----------------------------------------------------------------*/
520
521 /*
522  * A global list of pools that uses a struct mapped_device as a key.
523  */
524 static struct dm_thin_pool_table {
525         struct mutex mutex;
526         struct list_head pools;
527 } dm_thin_pool_table;
528
529 static void pool_table_init(void)
530 {
531         mutex_init(&dm_thin_pool_table.mutex);
532         INIT_LIST_HEAD(&dm_thin_pool_table.pools);
533 }
534
535 static void pool_table_exit(void)
536 {
537         mutex_destroy(&dm_thin_pool_table.mutex);
538 }
539
540 static void __pool_table_insert(struct pool *pool)
541 {
542         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
543         list_add(&pool->list, &dm_thin_pool_table.pools);
544 }
545
546 static void __pool_table_remove(struct pool *pool)
547 {
548         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
549         list_del(&pool->list);
550 }
551
552 static struct pool *__pool_table_lookup(struct mapped_device *md)
553 {
554         struct pool *pool = NULL, *tmp;
555
556         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
557
558         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
559                 if (tmp->pool_md == md) {
560                         pool = tmp;
561                         break;
562                 }
563         }
564
565         return pool;
566 }
567
568 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
569 {
570         struct pool *pool = NULL, *tmp;
571
572         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
573
574         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
575                 if (tmp->md_dev == md_dev) {
576                         pool = tmp;
577                         break;
578                 }
579         }
580
581         return pool;
582 }
583
584 /*----------------------------------------------------------------*/
585
586 struct dm_thin_endio_hook {
587         struct thin_c *tc;
588         struct dm_deferred_entry *shared_read_entry;
589         struct dm_deferred_entry *all_io_entry;
590         struct dm_thin_new_mapping *overwrite_mapping;
591         struct rb_node rb_node;
592         struct dm_bio_prison_cell *cell;
593 };
594
595 static void error_bio_list(struct bio_list *bios, blk_status_t error)
596 {
597         struct bio *bio;
598
599         while ((bio = bio_list_pop(bios))) {
600                 bio->bi_status = error;
601                 bio_endio(bio);
602         }
603 }
604
605 static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
606                 blk_status_t error)
607 {
608         struct bio_list bios;
609
610         bio_list_init(&bios);
611
612         spin_lock_irq(&tc->lock);
613         bio_list_merge_init(&bios, master);
614         spin_unlock_irq(&tc->lock);
615
616         error_bio_list(&bios, error);
617 }
618
619 static void requeue_deferred_cells(struct thin_c *tc)
620 {
621         struct pool *pool = tc->pool;
622         struct list_head cells;
623         struct dm_bio_prison_cell *cell, *tmp;
624
625         INIT_LIST_HEAD(&cells);
626
627         spin_lock_irq(&tc->lock);
628         list_splice_init(&tc->deferred_cells, &cells);
629         spin_unlock_irq(&tc->lock);
630
631         list_for_each_entry_safe(cell, tmp, &cells, user_list)
632                 cell_requeue(pool, cell);
633 }
634
635 static void requeue_io(struct thin_c *tc)
636 {
637         struct bio_list bios;
638
639         bio_list_init(&bios);
640
641         spin_lock_irq(&tc->lock);
642         bio_list_merge_init(&bios, &tc->deferred_bio_list);
643         bio_list_merge_init(&bios, &tc->retry_on_resume_list);
644         spin_unlock_irq(&tc->lock);
645
646         error_bio_list(&bios, BLK_STS_DM_REQUEUE);
647         requeue_deferred_cells(tc);
648 }
649
650 static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
651 {
652         struct thin_c *tc;
653
654         rcu_read_lock();
655         list_for_each_entry_rcu(tc, &pool->active_thins, list)
656                 error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
657         rcu_read_unlock();
658 }
659
660 static void error_retry_list(struct pool *pool)
661 {
662         error_retry_list_with_code(pool, get_pool_io_error_code(pool));
663 }
664
665 /*
666  * This section of code contains the logic for processing a thin device's IO.
667  * Much of the code depends on pool object resources (lists, workqueues, etc)
668  * but most is exclusively called from the thin target rather than the thin-pool
669  * target.
670  */
671
672 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
673 {
674         struct pool *pool = tc->pool;
675         sector_t block_nr = bio->bi_iter.bi_sector;
676
677         if (block_size_is_power_of_two(pool))
678                 block_nr >>= pool->sectors_per_block_shift;
679         else
680                 (void) sector_div(block_nr, pool->sectors_per_block);
681
682         return block_nr;
683 }
684
685 /*
686  * Returns the _complete_ blocks that this bio covers.
687  */
688 static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
689                                 dm_block_t *begin, dm_block_t *end)
690 {
691         struct pool *pool = tc->pool;
692         sector_t b = bio->bi_iter.bi_sector;
693         sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
694
695         b += pool->sectors_per_block - 1ull; /* so we round up */
696
697         if (block_size_is_power_of_two(pool)) {
698                 b >>= pool->sectors_per_block_shift;
699                 e >>= pool->sectors_per_block_shift;
700         } else {
701                 (void) sector_div(b, pool->sectors_per_block);
702                 (void) sector_div(e, pool->sectors_per_block);
703         }
704
705         if (e < b) {
706                 /* Can happen if the bio is within a single block. */
707                 e = b;
708         }
709
710         *begin = b;
711         *end = e;
712 }
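
/*
 * Worked example: with 128-sector blocks, a bio covering sectors [100, 612)
 * rounds its start up and its end down to *begin == 1 and *end == 4, i.e.
 * only blocks 1, 2 and 3 are completely covered.
 */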
713
714 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
715 {
716         struct pool *pool = tc->pool;
717         sector_t bi_sector = bio->bi_iter.bi_sector;
718
719         bio_set_dev(bio, tc->pool_dev->bdev);
720         if (block_size_is_power_of_two(pool)) {
721                 bio->bi_iter.bi_sector =
722                         (block << pool->sectors_per_block_shift) |
723                         (bi_sector & (pool->sectors_per_block - 1));
724         } else {
725                 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
726                                  sector_div(bi_sector, pool->sectors_per_block);
727         }
728 }
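
/*
 * For example, with 128-sector blocks, remapping a bio at sector 100 of the
 * thin device to data block 3 places it at sector (3 << 7) | (100 & 127),
 * i.e. sector 484 of the pool's data device.
 */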
729
730 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
731 {
732         bio_set_dev(bio, tc->origin_dev->bdev);
733 }
734
735 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
736 {
737         return op_is_flush(bio->bi_opf) &&
738                 dm_thin_changed_this_transaction(tc->td);
739 }
740
741 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
742 {
743         struct dm_thin_endio_hook *h;
744
745         if (bio_op(bio) == REQ_OP_DISCARD)
746                 return;
747
748         h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
749         h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
750 }
751
752 static void issue(struct thin_c *tc, struct bio *bio)
753 {
754         struct pool *pool = tc->pool;
755
756         if (!bio_triggers_commit(tc, bio)) {
757                 dm_submit_bio_remap(bio, NULL);
758                 return;
759         }
760
761         /*
762          * Complete bio with an error if earlier I/O caused changes to
763          * the metadata that can't be committed, e.g., due to I/O errors
764          * on the metadata device.
765          */
766         if (dm_thin_aborted_changes(tc->td)) {
767                 bio_io_error(bio);
768                 return;
769         }
770
771         /*
772          * Batch together any bios that trigger commits and then issue a
773          * single commit for them in process_deferred_bios().
774          */
775         spin_lock_irq(&pool->lock);
776         bio_list_add(&pool->deferred_flush_bios, bio);
777         spin_unlock_irq(&pool->lock);
778 }
779
780 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
781 {
782         remap_to_origin(tc, bio);
783         issue(tc, bio);
784 }
785
786 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
787                             dm_block_t block)
788 {
789         remap(tc, bio, block);
790         issue(tc, bio);
791 }
792
793 /*----------------------------------------------------------------*/
794
795 /*
796  * Bio endio functions.
797  */
798 struct dm_thin_new_mapping {
799         struct list_head list;
800
801         bool pass_discard:1;
802         bool maybe_shared:1;
803
804         /*
805          * Track quiescing, copying and zeroing preparation actions.  When this
806          * counter hits zero the block is prepared and can be inserted into the
807          * btree.
808          */
809         atomic_t prepare_actions;
810
811         blk_status_t status;
812         struct thin_c *tc;
813         dm_block_t virt_begin, virt_end;
814         dm_block_t data_block;
815         struct dm_bio_prison_cell *cell;
816
817         /*
818          * If the bio covers the whole area of a block then we can avoid
819          * zeroing or copying.  Instead this bio is hooked.  The bio will
820          * still be in the cell, so care has to be taken to avoid issuing
821          * the bio twice.
822          */
823         struct bio *bio;
824         bio_end_io_t *saved_bi_end_io;
825 };
826
827 static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
828 {
829         struct pool *pool = m->tc->pool;
830
831         if (atomic_dec_and_test(&m->prepare_actions)) {
832                 list_add_tail(&m->list, &pool->prepared_mappings);
833                 wake_worker(pool);
834         }
835 }
836
837 static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
838 {
839         unsigned long flags;
840         struct pool *pool = m->tc->pool;
841
842         spin_lock_irqsave(&pool->lock, flags);
843         __complete_mapping_preparation(m);
844         spin_unlock_irqrestore(&pool->lock, flags);
845 }
846
847 static void copy_complete(int read_err, unsigned long write_err, void *context)
848 {
849         struct dm_thin_new_mapping *m = context;
850
851         m->status = read_err || write_err ? BLK_STS_IOERR : 0;
852         complete_mapping_preparation(m);
853 }
854
855 static void overwrite_endio(struct bio *bio)
856 {
857         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
858         struct dm_thin_new_mapping *m = h->overwrite_mapping;
859
860         bio->bi_end_io = m->saved_bi_end_io;
861
862         m->status = bio->bi_status;
863         complete_mapping_preparation(m);
864 }
865
866 /*----------------------------------------------------------------*/
867
868 /*
869  * Workqueue.
870  */
871
872 /*
873  * Prepared mapping jobs.
874  */
875
876 /*
877  * This sends the bios in the cell, except the original holder, back
878  * to the thin device's deferred_bio_list.
879  */
880 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
881 {
882         struct pool *pool = tc->pool;
883         unsigned long flags;
884         struct bio_list bios;
885
886         bio_list_init(&bios);
887         cell_release_no_holder(pool, cell, &bios);
888
889         if (!bio_list_empty(&bios)) {
890                 spin_lock_irqsave(&tc->lock, flags);
891                 bio_list_merge(&tc->deferred_bio_list, &bios);
892                 spin_unlock_irqrestore(&tc->lock, flags);
893                 wake_worker(pool);
894         }
895 }
896
897 static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
898
899 struct remap_info {
900         struct thin_c *tc;
901         struct bio_list defer_bios;
902         struct bio_list issue_bios;
903 };
904
905 static void __inc_remap_and_issue_cell(void *context,
906                                        struct dm_bio_prison_cell *cell)
907 {
908         struct remap_info *info = context;
909         struct bio *bio;
910
911         while ((bio = bio_list_pop(&cell->bios))) {
912                 if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
913                         bio_list_add(&info->defer_bios, bio);
914                 else {
915                         inc_all_io_entry(info->tc->pool, bio);
916
917                         /*
918                          * We can't issue the bios with the bio prison lock
919                          * held, so we add them to a list to issue on
920                          * return from this function.
921                          */
922                         bio_list_add(&info->issue_bios, bio);
923                 }
924         }
925 }
926
927 static void inc_remap_and_issue_cell(struct thin_c *tc,
928                                      struct dm_bio_prison_cell *cell,
929                                      dm_block_t block)
930 {
931         struct bio *bio;
932         struct remap_info info;
933
934         info.tc = tc;
935         bio_list_init(&info.defer_bios);
936         bio_list_init(&info.issue_bios);
937
938         /*
939          * We have to be careful to inc any bios we're about to issue
940          * before the cell is released, and avoid a race with new bios
941          * being added to the cell.
942          */
943         cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
944                            &info, cell);
945
946         while ((bio = bio_list_pop(&info.defer_bios)))
947                 thin_defer_bio(tc, bio);
948
949         while ((bio = bio_list_pop(&info.issue_bios)))
950                 remap_and_issue(info.tc, bio, block);
951 }
952
953 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
954 {
955         cell_error(m->tc->pool, m->cell);
956         list_del(&m->list);
957         mempool_free(m, &m->tc->pool->mapping_pool);
958 }
959
960 static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
961 {
962         struct pool *pool = tc->pool;
963
964         /*
965          * If the bio has the REQ_FUA flag set we must commit the metadata
966          * before signaling its completion.
967          */
968         if (!bio_triggers_commit(tc, bio)) {
969                 bio_endio(bio);
970                 return;
971         }
972
973         /*
974          * Complete bio with an error if earlier I/O caused changes to the
975          * metadata that can't be committed, e.g., due to I/O errors on the
976          * metadata device.
977          */
978         if (dm_thin_aborted_changes(tc->td)) {
979                 bio_io_error(bio);
980                 return;
981         }
982
983         /*
984          * Batch together any bios that trigger commits and then issue a
985          * single commit for them in process_deferred_bios().
986          */
987         spin_lock_irq(&pool->lock);
988         bio_list_add(&pool->deferred_flush_completions, bio);
989         spin_unlock_irq(&pool->lock);
990 }
991
992 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
993 {
994         struct thin_c *tc = m->tc;
995         struct pool *pool = tc->pool;
996         struct bio *bio = m->bio;
997         int r;
998
999         if (m->status) {
1000                 cell_error(pool, m->cell);
1001                 goto out;
1002         }
1003
1004         /*
1005          * Commit the prepared block into the mapping btree.
1006          * Any I/O for this block arriving after this point will get
1007          * remapped to it directly.
1008          */
1009         r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
1010         if (r) {
1011                 metadata_operation_failed(pool, "dm_thin_insert_block", r);
1012                 cell_error(pool, m->cell);
1013                 goto out;
1014         }
1015
1016         /*
1017          * Release any bios held while the block was being provisioned.
1018          * If we are processing a write bio that completely covers the block,
1019          * we already processed it so can ignore it now when processing
1020          * the bios in the cell.
1021          */
1022         if (bio) {
1023                 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1024                 complete_overwrite_bio(tc, bio);
1025         } else {
1026                 inc_all_io_entry(tc->pool, m->cell->holder);
1027                 remap_and_issue(tc, m->cell->holder, m->data_block);
1028                 inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1029         }
1030
1031 out:
1032         list_del(&m->list);
1033         mempool_free(m, &pool->mapping_pool);
1034 }
1035
1036 /*----------------------------------------------------------------*/
1037
1038 static void free_discard_mapping(struct dm_thin_new_mapping *m)
1039 {
1040         struct thin_c *tc = m->tc;
1041
1042         if (m->cell)
1043                 cell_defer_no_holder(tc, m->cell);
1044         mempool_free(m, &tc->pool->mapping_pool);
1045 }
1046
1047 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1048 {
1049         bio_io_error(m->bio);
1050         free_discard_mapping(m);
1051 }
1052
1053 static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1054 {
1055         bio_endio(m->bio);
1056         free_discard_mapping(m);
1057 }
1058
1059 static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1060 {
1061         int r;
1062         struct thin_c *tc = m->tc;
1063
1064         r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
1065         if (r) {
1066                 metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
1067                 bio_io_error(m->bio);
1068         } else
1069                 bio_endio(m->bio);
1070
1071         cell_defer_no_holder(tc, m->cell);
1072         mempool_free(m, &tc->pool->mapping_pool);
1073 }
1074
1075 /*----------------------------------------------------------------*/
1076
1077 static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1078                                                    struct bio *discard_parent)
1079 {
1080         /*
1081          * We've already unmapped this range of blocks, but before we
1082          * pass the discard down we have to check that these blocks are now unused.
1083          */
1084         int r = 0;
1085         bool shared = true;
1086         struct thin_c *tc = m->tc;
1087         struct pool *pool = tc->pool;
1088         dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1089         struct discard_op op;
1090
1091         begin_discard(&op, tc, discard_parent);
1092         while (b != end) {
1093                 /* find start of unmapped run */
1094                 for (; b < end; b++) {
1095                         r = dm_pool_block_is_shared(pool->pmd, b, &shared);
1096                         if (r)
1097                                 goto out;
1098
1099                         if (!shared)
1100                                 break;
1101                 }
1102
1103                 if (b == end)
1104                         break;
1105
1106                 /* find end of run */
1107                 for (e = b + 1; e != end; e++) {
1108                         r = dm_pool_block_is_shared(pool->pmd, e, &shared);
1109                         if (r)
1110                                 goto out;
1111
1112                         if (shared)
1113                                 break;
1114                 }
1115
1116                 r = issue_discard(&op, b, e);
1117                 if (r)
1118                         goto out;
1119
1120                 b = e;
1121         }
1122 out:
1123         end_discard(&op, r);
1124 }
1125
1126 static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1127 {
1128         unsigned long flags;
1129         struct pool *pool = m->tc->pool;
1130
1131         spin_lock_irqsave(&pool->lock, flags);
1132         list_add_tail(&m->list, &pool->prepared_discards_pt2);
1133         spin_unlock_irqrestore(&pool->lock, flags);
1134         wake_worker(pool);
1135 }
1136
1137 static void passdown_endio(struct bio *bio)
1138 {
1139         /*
1140          * It doesn't matter if the passdown discard failed; we still want
1141          * to unmap (we ignore err).
1142          */
1143         queue_passdown_pt2(bio->bi_private);
1144         bio_put(bio);
1145 }
1146
1147 static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1148 {
1149         int r;
1150         struct thin_c *tc = m->tc;
1151         struct pool *pool = tc->pool;
1152         struct bio *discard_parent;
1153         dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1154
1155         /*
1156          * Only this thread allocates blocks, so we can be sure that the
1157          * newly unmapped blocks will not be allocated before the end of
1158          * the function.
1159          */
1160         r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
1161         if (r) {
1162                 metadata_operation_failed(pool, "dm_thin_remove_range", r);
1163                 bio_io_error(m->bio);
1164                 cell_defer_no_holder(tc, m->cell);
1165                 mempool_free(m, &pool->mapping_pool);
1166                 return;
1167         }
1168
1169         /*
1170          * Increment the unmapped blocks.  This prevents a race between the
1171          * passdown io and reallocation of freed blocks.
1172          */
1173         r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
1174         if (r) {
1175                 metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
1176                 bio_io_error(m->bio);
1177                 cell_defer_no_holder(tc, m->cell);
1178                 mempool_free(m, &pool->mapping_pool);
1179                 return;
1180         }
1181
1182         discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
1183         discard_parent->bi_end_io = passdown_endio;
1184         discard_parent->bi_private = m;
1185         if (m->maybe_shared)
1186                 passdown_double_checking_shared_status(m, discard_parent);
1187         else {
1188                 struct discard_op op;
1189
1190                 begin_discard(&op, tc, discard_parent);
1191                 r = issue_discard(&op, m->data_block, data_end);
1192                 end_discard(&op, r);
1193         }
1194 }
1195
1196 static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1197 {
1198         int r;
1199         struct thin_c *tc = m->tc;
1200         struct pool *pool = tc->pool;
1201
1202         /*
1203          * The passdown has completed, so now we can decrement all those
1204          * unmapped blocks.
1205          */
1206         r = dm_pool_dec_data_range(pool->pmd, m->data_block,
1207                                    m->data_block + (m->virt_end - m->virt_begin));
1208         if (r) {
1209                 metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
1210                 bio_io_error(m->bio);
1211         } else
1212                 bio_endio(m->bio);
1213
1214         cell_defer_no_holder(tc, m->cell);
1215         mempool_free(m, &pool->mapping_pool);
1216 }
1217
1218 static void process_prepared(struct pool *pool, struct list_head *head,
1219                              process_mapping_fn *fn)
1220 {
1221         struct list_head maps;
1222         struct dm_thin_new_mapping *m, *tmp;
1223
1224         INIT_LIST_HEAD(&maps);
1225         spin_lock_irq(&pool->lock);
1226         list_splice_init(head, &maps);
1227         spin_unlock_irq(&pool->lock);
1228
1229         list_for_each_entry_safe(m, tmp, &maps, list)
1230                 (*fn)(m);
1231 }
1232
1233 /*
1234  * Deferred bio jobs.
1235  */
1236 static int io_overlaps_block(struct pool *pool, struct bio *bio)
1237 {
1238         return bio->bi_iter.bi_size ==
1239                 (pool->sectors_per_block << SECTOR_SHIFT);
1240 }
1241
1242 static int io_overwrites_block(struct pool *pool, struct bio *bio)
1243 {
1244         return (bio_data_dir(bio) == WRITE) &&
1245                 io_overlaps_block(pool, bio);
1246 }
1247
1248 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1249                                bio_end_io_t *fn)
1250 {
1251         *save = bio->bi_end_io;
1252         bio->bi_end_io = fn;
1253 }
1254
1255 static int ensure_next_mapping(struct pool *pool)
1256 {
1257         if (pool->next_mapping)
1258                 return 0;
1259
1260         pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
1261
1262         return pool->next_mapping ? 0 : -ENOMEM;
1263 }
1264
1265 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1266 {
1267         struct dm_thin_new_mapping *m = pool->next_mapping;
1268
1269         BUG_ON(!pool->next_mapping);
1270
1271         memset(m, 0, sizeof(struct dm_thin_new_mapping));
1272         INIT_LIST_HEAD(&m->list);
1273         m->bio = NULL;
1274
1275         pool->next_mapping = NULL;
1276
1277         return m;
1278 }
1279
1280 static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
1281                     sector_t begin, sector_t end)
1282 {
1283         struct dm_io_region to;
1284
1285         to.bdev = tc->pool_dev->bdev;
1286         to.sector = begin;
1287         to.count = end - begin;
1288
1289         dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
1290 }
1291
1292 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
1293                                       dm_block_t data_begin,
1294                                       struct dm_thin_new_mapping *m)
1295 {
1296         struct pool *pool = tc->pool;
1297         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1298
1299         h->overwrite_mapping = m;
1300         m->bio = bio;
1301         save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1302         inc_all_io_entry(pool, bio);
1303         remap_and_issue(tc, bio, data_begin);
1304 }
1305
1306 /*
1307  * A partial copy also needs to zero the uncopied region.
1308  */
1309 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1310                           struct dm_dev *origin, dm_block_t data_origin,
1311                           dm_block_t data_dest,
1312                           struct dm_bio_prison_cell *cell, struct bio *bio,
1313                           sector_t len)
1314 {
1315         struct pool *pool = tc->pool;
1316         struct dm_thin_new_mapping *m = get_next_mapping(pool);
1317
1318         m->tc = tc;
1319         m->virt_begin = virt_block;
1320         m->virt_end = virt_block + 1u;
1321         m->data_block = data_dest;
1322         m->cell = cell;
1323
1324         /*
1325          * quiesce action + copy action + an extra reference held for the
1326          * duration of this function (we may need to inc later for a
1327          * partial zero).
1328          */
1329         atomic_set(&m->prepare_actions, 3);
1330
1331         if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
1332                 complete_mapping_preparation(m); /* already quiesced */
1333
1334         /*
1335          * IO to pool_dev remaps to the pool target's data_dev.
1336          *
1337          * If the whole block of data is being overwritten, we can issue the
1338          * bio immediately. Otherwise we use kcopyd to clone the data first.
1339          */
1340         if (io_overwrites_block(pool, bio))
1341                 remap_and_issue_overwrite(tc, bio, data_dest, m);
1342         else {
1343                 struct dm_io_region from, to;
1344
1345                 from.bdev = origin->bdev;
1346                 from.sector = data_origin * pool->sectors_per_block;
1347                 from.count = len;
1348
1349                 to.bdev = tc->pool_dev->bdev;
1350                 to.sector = data_dest * pool->sectors_per_block;
1351                 to.count = len;
1352
1353                 dm_kcopyd_copy(pool->copier, &from, 1, &to,
1354                                0, copy_complete, m);
1355
1356                 /*
1357                  * Do we need to zero a tail region?
1358                  */
1359                 if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1360                         atomic_inc(&m->prepare_actions);
1361                         ll_zero(tc, m,
1362                                 data_dest * pool->sectors_per_block + len,
1363                                 (data_dest + 1) * pool->sectors_per_block);
1364                 }
1365         }
1366
1367         complete_mapping_preparation(m); /* drop our ref */
1368 }
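
/*
 * Note on the accounting above: prepare_actions starts at 3 (quiesce, copy
 * and the reference dropped at the end of the function) and is bumped to 4
 * when a partial copy also needs its tail zeroed.
 */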
1369
1370 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1371                                    dm_block_t data_origin, dm_block_t data_dest,
1372                                    struct dm_bio_prison_cell *cell, struct bio *bio)
1373 {
1374         schedule_copy(tc, virt_block, tc->pool_dev,
1375                       data_origin, data_dest, cell, bio,
1376                       tc->pool->sectors_per_block);
1377 }
1378
1379 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1380                           dm_block_t data_block, struct dm_bio_prison_cell *cell,
1381                           struct bio *bio)
1382 {
1383         struct pool *pool = tc->pool;
1384         struct dm_thin_new_mapping *m = get_next_mapping(pool);
1385
1386         atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
1387         m->tc = tc;
1388         m->virt_begin = virt_block;
1389         m->virt_end = virt_block + 1u;
1390         m->data_block = data_block;
1391         m->cell = cell;
1392
1393         /*
1394          * If the whole block of data is being overwritten or we are not
1395          * zeroing pre-existing data, we can issue the bio immediately.
1396          * Otherwise we use kcopyd to zero the data first.
1397          */
1398         if (pool->pf.zero_new_blocks) {
1399                 if (io_overwrites_block(pool, bio))
1400                         remap_and_issue_overwrite(tc, bio, data_block, m);
1401                 else {
1402                         ll_zero(tc, m, data_block * pool->sectors_per_block,
1403                                 (data_block + 1) * pool->sectors_per_block);
1404                 }
1405         } else
1406                 process_prepared_mapping(m);
1407 }
1408
1409 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1410                                    dm_block_t data_dest,
1411                                    struct dm_bio_prison_cell *cell, struct bio *bio)
1412 {
1413         struct pool *pool = tc->pool;
1414         sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1415         sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1416
1417         if (virt_block_end <= tc->origin_size) {
1418                 schedule_copy(tc, virt_block, tc->origin_dev,
1419                               virt_block, data_dest, cell, bio,
1420                               pool->sectors_per_block);
1421
1422         } else if (virt_block_begin < tc->origin_size) {
1423                 schedule_copy(tc, virt_block, tc->origin_dev,
1424                               virt_block, data_dest, cell, bio,
1425                               tc->origin_size - virt_block_begin);
1426
1427         } else
1428                 schedule_zero(tc, virt_block, data_dest, cell, bio);
1429 }
1430
1431 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1432
1433 static void requeue_bios(struct pool *pool);
1434
1435 static bool is_read_only_pool_mode(enum pool_mode mode)
1436 {
1437         return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1438 }
1439
1440 static bool is_read_only(struct pool *pool)
1441 {
1442         return is_read_only_pool_mode(get_pool_mode(pool));
1443 }
1444
1445 static void check_for_metadata_space(struct pool *pool)
1446 {
1447         int r;
1448         const char *ooms_reason = NULL;
1449         dm_block_t nr_free;
1450
1451         r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1452         if (r)
1453                 ooms_reason = "Could not get free metadata blocks";
1454         else if (!nr_free)
1455                 ooms_reason = "No free metadata blocks";
1456
1457         if (ooms_reason && !is_read_only(pool)) {
1458                 DMERR("%s", ooms_reason);
1459                 set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1460         }
1461 }
1462
1463 static void check_for_data_space(struct pool *pool)
1464 {
1465         int r;
1466         dm_block_t nr_free;
1467
1468         if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1469                 return;
1470
1471         r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1472         if (r)
1473                 return;
1474
1475         if (nr_free) {
1476                 set_pool_mode(pool, PM_WRITE);
1477                 requeue_bios(pool);
1478         }
1479 }
1480
1481 /*
1482  * A non-zero return indicates read_only or fail_io mode.
1483  * Many callers don't care about the return value.
1484  */
1485 static int commit(struct pool *pool)
1486 {
1487         int r;
1488
1489         if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1490                 return -EINVAL;
1491
1492         r = dm_pool_commit_metadata(pool->pmd);
1493         if (r)
1494                 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1495         else {
1496                 check_for_metadata_space(pool);
1497                 check_for_data_space(pool);
1498         }
1499
1500         return r;
1501 }
1502
1503 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1504 {
1505         if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1506                 DMWARN("%s: reached low water mark for data device: sending event.",
1507                        dm_device_name(pool->pool_md));
1508                 spin_lock_irq(&pool->lock);
1509                 pool->low_water_triggered = true;
1510                 spin_unlock_irq(&pool->lock);
1511                 dm_table_event(pool->ti->table);
1512         }
1513 }
1514
1515 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1516 {
1517         int r;
1518         dm_block_t free_blocks;
1519         struct pool *pool = tc->pool;
1520
1521         if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1522                 return -EINVAL;
1523
1524         r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1525         if (r) {
1526                 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1527                 return r;
1528         }
1529
1530         check_low_water_mark(pool, free_blocks);
1531
1532         if (!free_blocks) {
1533                 /*
1534                  * Try to commit to see if that will free up some
1535                  * more space.
1536                  */
1537                 r = commit(pool);
1538                 if (r)
1539                         return r;
1540
1541                 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1542                 if (r) {
1543                         metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1544                         return r;
1545                 }
1546
1547                 if (!free_blocks) {
1548                         set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1549                         return -ENOSPC;
1550                 }
1551         }
1552
1553         r = dm_pool_alloc_data_block(pool->pmd, result);
1554         if (r) {
1555                 if (r == -ENOSPC)
1556                         set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1557                 else
1558                         metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1559                 return r;
1560         }
1561
1562         r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1563         if (r) {
1564                 metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1565                 return r;
1566         }
1567
1568         if (!free_blocks) {
1569                 /* Let's commit before we use up the metadata reserve. */
1570                 r = commit(pool);
1571                 if (r)
1572                         return r;
1573         }
1574
1575         return 0;
1576 }
1577
1578 /*
1579  * If we have run out of space, queue bios until the device is
1580  * resumed, presumably after having been reloaded with more space.
1581  */
1582 static void retry_on_resume(struct bio *bio)
1583 {
1584         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1585         struct thin_c *tc = h->tc;
1586
1587         spin_lock_irq(&tc->lock);
1588         bio_list_add(&tc->retry_on_resume_list, bio);
1589         spin_unlock_irq(&tc->lock);
1590 }
1591
1592 static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1593 {
1594         enum pool_mode m = get_pool_mode(pool);
1595
1596         switch (m) {
1597         case PM_WRITE:
1598                 /* Shouldn't get here */
1599                 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1600                 return BLK_STS_IOERR;
1601
1602         case PM_OUT_OF_DATA_SPACE:
1603                 return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1604
1605         case PM_OUT_OF_METADATA_SPACE:
1606         case PM_READ_ONLY:
1607         case PM_FAIL:
1608                 return BLK_STS_IOERR;
1609         default:
1610                 /* Shouldn't get here */
1611                 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1612                 return BLK_STS_IOERR;
1613         }
1614 }
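
/*
 * A zero return from should_error_unserviceable_bio() (out of data space
 * with error_if_no_space disabled) means callers queue the bio via
 * retry_on_resume() rather than failing it.
 */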
1615
1616 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1617 {
1618         blk_status_t error = should_error_unserviceable_bio(pool);
1619
1620         if (error) {
1621                 bio->bi_status = error;
1622                 bio_endio(bio);
1623         } else
1624                 retry_on_resume(bio);
1625 }
1626
1627 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1628 {
1629         struct bio *bio;
1630         struct bio_list bios;
1631         blk_status_t error;
1632
1633         error = should_error_unserviceable_bio(pool);
1634         if (error) {
1635                 cell_error_with_code(pool, cell, error);
1636                 return;
1637         }
1638
1639         bio_list_init(&bios);
1640         cell_release(pool, cell, &bios);
1641
1642         while ((bio = bio_list_pop(&bios)))
1643                 retry_on_resume(bio);
1644 }
1645
1646 static void process_discard_cell_no_passdown(struct thin_c *tc,
1647                                              struct dm_bio_prison_cell *virt_cell)
1648 {
1649         struct pool *pool = tc->pool;
1650         struct dm_thin_new_mapping *m = get_next_mapping(pool);
1651
1652         /*
1653          * We don't need to lock the data blocks, since there's no
1654          * passdown.  We only lock data blocks for allocation and breaking sharing.
1655          */
1656         m->tc = tc;
1657         m->virt_begin = virt_cell->key.block_begin;
1658         m->virt_end = virt_cell->key.block_end;
1659         m->cell = virt_cell;
1660         m->bio = virt_cell->holder;
1661
1662         if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1663                 pool->process_prepared_discard(m);
1664 }
1665
1666 static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1667                                  struct bio *bio)
1668 {
1669         struct pool *pool = tc->pool;
1670
1671         int r;
1672         bool maybe_shared;
1673         struct dm_cell_key data_key;
1674         struct dm_bio_prison_cell *data_cell;
1675         struct dm_thin_new_mapping *m;
1676         dm_block_t virt_begin, virt_end, data_begin, data_end;
1677         dm_block_t len, next_boundary;
1678
1679         while (begin != end) {
1680                 r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1681                                               &data_begin, &maybe_shared);
1682                 if (r) {
1683                         /*
1684                          * Silently fail, letting any mappings we've
1685                          * created complete.
1686                          */
1687                         break;
1688                 }
1689
1690                 data_end = data_begin + (virt_end - virt_begin);
1691
1692                 /*
1693                  * Make sure the data region obeys the bio prison restrictions.
1694                  */
1695                 while (data_begin < data_end) {
1696                         r = ensure_next_mapping(pool);
1697                         if (r)
1698                                 return; /* we did our best */
1699
1700                         next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
1701                                 << BIO_PRISON_MAX_RANGE_SHIFT;
1702                         len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
1703
1704                         /* This key is certainly within range given the above splitting */
1705                         (void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
1706                         if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1707                                 /* contention, we'll give up on this range */
1708                                 data_begin += len;
1709                                 continue;
1710                         }
1711
1712                         /*
1713                          * IO may still be going to the destination block.  We must
1714                          * quiesce before we can do the removal.
1715                          */
1716                         m = get_next_mapping(pool);
1717                         m->tc = tc;
1718                         m->maybe_shared = maybe_shared;
1719                         m->virt_begin = virt_begin;
1720                         m->virt_end = virt_begin + len;
1721                         m->data_block = data_begin;
1722                         m->cell = data_cell;
1723                         m->bio = bio;
1724
1725                         /*
1726                          * The parent bio must not complete before sub discard bios are
1727                          * chained to it (see end_discard's bio_chain)!
1728                          *
1729                          * This per-mapping bi_remaining increment is paired with
1730                          * the implicit decrement that occurs via bio_endio() in
1731                          * end_discard().
1732                          */
1733                         bio_inc_remaining(bio);
1734                         if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1735                                 pool->process_prepared_discard(m);
1736
1737                         virt_begin += len;
1738                         data_begin += len;
1739                 }
1740
1741                 begin = virt_end;
1742         }
1743 }
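
/*
 * Example of the splitting above: a discard whose mapped data run crosses
 * a bio prison range boundary (a multiple of 1 << BIO_PRISON_MAX_RANGE_SHIFT
 * data blocks) is handled as two or more sub-ranges.  Each sub-range gets
 * its own physical cell, its own dm_thin_new_mapping and its own
 * bio_inc_remaining() on the parent bio, so the parent only completes in
 * end_discard() once every sub-discard has finished.
 */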
1744
1745 static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1746 {
1747         struct bio *bio = virt_cell->holder;
1748         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1749
1750         /*
1751          * The virt_cell will only get freed once the origin bio completes.
1752          * This means it will remain locked while all the individual
1753          * passdown bios are in flight.
1754          */
1755         h->cell = virt_cell;
1756         break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1757
1758         /*
1759          * We complete the bio now, knowing that the bi_remaining field
1760          * will prevent completion until the sub range discards have
1761          * completed.
1762          */
1763         bio_endio(bio);
1764 }
1765
1766 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1767 {
1768         dm_block_t begin, end;
1769         struct dm_cell_key virt_key;
1770         struct dm_bio_prison_cell *virt_cell;
1771
1772         get_bio_block_range(tc, bio, &begin, &end);
1773         if (begin == end) {
1774                 /*
1775                  * The discard covers less than a block.
1776                  */
1777                 bio_endio(bio);
1778                 return;
1779         }
1780
1781         if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
1782                 DMERR_LIMIT("Discard doesn't respect bio prison limits");
1783                 bio_endio(bio);
1784                 return;
1785         }
1786
1787         if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) {
1788                 /*
1789                  * Potential starvation issue: We're relying on the
1790                  * fs/application being well behaved, and not trying to
1791                  * send IO to a region at the same time as discarding it.
1792                  * If they do this persistently then it's possible this
1793                  * cell will never be granted.
1794                  */
1795                 return;
1796         }
1797
1798         tc->pool->process_discard_cell(tc, virt_cell);
1799 }
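
/*
 * Discard handling in outline: process_discard_bio() computes the range of
 * whole blocks covered by the bio (partial-block discards are simply
 * completed) and detains a virtual cell over [begin, end).  The cell is then
 * handed to either process_discard_cell_no_passdown(), which only removes
 * the mappings, or process_discard_cell_passdown(), which also passes the
 * discard down to the data device, depending on the pool's discard_passdown
 * setting (see set_discard_callbacks()).
 */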
1800
1801 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1802                           struct dm_cell_key *key,
1803                           struct dm_thin_lookup_result *lookup_result,
1804                           struct dm_bio_prison_cell *cell)
1805 {
1806         int r;
1807         dm_block_t data_block;
1808         struct pool *pool = tc->pool;
1809
1810         r = alloc_data_block(tc, &data_block);
1811         switch (r) {
1812         case 0:
1813                 schedule_internal_copy(tc, block, lookup_result->block,
1814                                        data_block, cell, bio);
1815                 break;
1816
1817         case -ENOSPC:
1818                 retry_bios_on_resume(pool, cell);
1819                 break;
1820
1821         default:
1822                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1823                             __func__, r);
1824                 cell_error(pool, cell);
1825                 break;
1826         }
1827 }
1828
1829 static void __remap_and_issue_shared_cell(void *context,
1830                                           struct dm_bio_prison_cell *cell)
1831 {
1832         struct remap_info *info = context;
1833         struct bio *bio;
1834
1835         while ((bio = bio_list_pop(&cell->bios))) {
1836                 if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1837                     bio_op(bio) == REQ_OP_DISCARD)
1838                         bio_list_add(&info->defer_bios, bio);
1839                 else {
1840                         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1841
1842                         h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1843                         inc_all_io_entry(info->tc->pool, bio);
1844                         bio_list_add(&info->issue_bios, bio);
1845                 }
1846         }
1847 }
1848
1849 static void remap_and_issue_shared_cell(struct thin_c *tc,
1850                                         struct dm_bio_prison_cell *cell,
1851                                         dm_block_t block)
1852 {
1853         struct bio *bio;
1854         struct remap_info info;
1855
1856         info.tc = tc;
1857         bio_list_init(&info.defer_bios);
1858         bio_list_init(&info.issue_bios);
1859
1860         cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1861                            &info, cell);
1862
1863         while ((bio = bio_list_pop(&info.defer_bios)))
1864                 thin_defer_bio(tc, bio);
1865
1866         while ((bio = bio_list_pop(&info.issue_bios)))
1867                 remap_and_issue(tc, bio, block);
1868 }
1869
1870 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1871                                dm_block_t block,
1872                                struct dm_thin_lookup_result *lookup_result,
1873                                struct dm_bio_prison_cell *virt_cell)
1874 {
1875         struct dm_bio_prison_cell *data_cell;
1876         struct pool *pool = tc->pool;
1877         struct dm_cell_key key;
1878
1879         /*
1880          * If cell is already occupied, then sharing is already in the process
1881          * of being broken so we have nothing further to do here.
1882          */
1883         build_data_key(tc->td, lookup_result->block, &key);
1884         if (bio_detain(pool, &key, bio, &data_cell)) {
1885                 cell_defer_no_holder(tc, virt_cell);
1886                 return;
1887         }
1888
1889         if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1890                 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1891                 cell_defer_no_holder(tc, virt_cell);
1892         } else {
1893                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1894
1895                 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1896                 inc_all_io_entry(pool, bio);
1897                 remap_and_issue(tc, bio, lookup_result->block);
1898
1899                 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1900                 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1901         }
1902 }
1903
1904 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1905                             struct dm_bio_prison_cell *cell)
1906 {
1907         int r;
1908         dm_block_t data_block;
1909         struct pool *pool = tc->pool;
1910
1911         /*
1912          * Remap empty bios (flushes) immediately, without provisioning.
1913          */
1914         if (!bio->bi_iter.bi_size) {
1915                 inc_all_io_entry(pool, bio);
1916                 cell_defer_no_holder(tc, cell);
1917
1918                 remap_and_issue(tc, bio, 0);
1919                 return;
1920         }
1921
1922         /*
1923          * Fill read bios with zeroes and complete them immediately.
1924          */
1925         if (bio_data_dir(bio) == READ) {
1926                 zero_fill_bio(bio);
1927                 cell_defer_no_holder(tc, cell);
1928                 bio_endio(bio);
1929                 return;
1930         }
1931
1932         r = alloc_data_block(tc, &data_block);
1933         switch (r) {
1934         case 0:
1935                 if (tc->origin_dev)
1936                         schedule_external_copy(tc, block, data_block, cell, bio);
1937                 else
1938                         schedule_zero(tc, block, data_block, cell, bio);
1939                 break;
1940
1941         case -ENOSPC:
1942                 retry_bios_on_resume(pool, cell);
1943                 break;
1944
1945         default:
1946                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1947                             __func__, r);
1948                 cell_error(pool, cell);
1949                 break;
1950         }
1951 }
1952
1953 static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1954 {
1955         int r;
1956         struct pool *pool = tc->pool;
1957         struct bio *bio = cell->holder;
1958         dm_block_t block = get_bio_block(tc, bio);
1959         struct dm_thin_lookup_result lookup_result;
1960
1961         if (tc->requeue_mode) {
1962                 cell_requeue(pool, cell);
1963                 return;
1964         }
1965
1966         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1967         switch (r) {
1968         case 0:
1969                 if (lookup_result.shared)
1970                         process_shared_bio(tc, bio, block, &lookup_result, cell);
1971                 else {
1972                         inc_all_io_entry(pool, bio);
1973                         remap_and_issue(tc, bio, lookup_result.block);
1974                         inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1975                 }
1976                 break;
1977
1978         case -ENODATA:
1979                 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1980                         inc_all_io_entry(pool, bio);
1981                         cell_defer_no_holder(tc, cell);
1982
1983                         if (bio_end_sector(bio) <= tc->origin_size)
1984                                 remap_to_origin_and_issue(tc, bio);
1985
1986                         else if (bio->bi_iter.bi_sector < tc->origin_size) {
1987                                 zero_fill_bio(bio);
1988                                 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1989                                 remap_to_origin_and_issue(tc, bio);
1990
1991                         } else {
1992                                 zero_fill_bio(bio);
1993                                 bio_endio(bio);
1994                         }
1995                 } else
1996                         provision_block(tc, bio, block, cell);
1997                 break;
1998
1999         default:
2000                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2001                             __func__, r);
2002                 cell_defer_no_holder(tc, cell);
2003                 bio_io_error(bio);
2004                 break;
2005         }
2006 }
2007
2008 static void process_bio(struct thin_c *tc, struct bio *bio)
2009 {
2010         struct pool *pool = tc->pool;
2011         dm_block_t block = get_bio_block(tc, bio);
2012         struct dm_bio_prison_cell *cell;
2013         struct dm_cell_key key;
2014
2015         /*
2016          * If cell is already occupied, then the block is already
2017          * being provisioned so we have nothing further to do here.
2018          */
2019         build_virtual_key(tc->td, block, &key);
2020         if (bio_detain(pool, &key, bio, &cell))
2021                 return;
2022
2023         process_cell(tc, cell);
2024 }
2025
2026 static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
2027                                     struct dm_bio_prison_cell *cell)
2028 {
2029         int r;
2030         int rw = bio_data_dir(bio);
2031         dm_block_t block = get_bio_block(tc, bio);
2032         struct dm_thin_lookup_result lookup_result;
2033
2034         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
2035         switch (r) {
2036         case 0:
2037                 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
2038                         handle_unserviceable_bio(tc->pool, bio);
2039                         if (cell)
2040                                 cell_defer_no_holder(tc, cell);
2041                 } else {
2042                         inc_all_io_entry(tc->pool, bio);
2043                         remap_and_issue(tc, bio, lookup_result.block);
2044                         if (cell)
2045                                 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
2046                 }
2047                 break;
2048
2049         case -ENODATA:
2050                 if (cell)
2051                         cell_defer_no_holder(tc, cell);
2052                 if (rw != READ) {
2053                         handle_unserviceable_bio(tc->pool, bio);
2054                         break;
2055                 }
2056
2057                 if (tc->origin_dev) {
2058                         inc_all_io_entry(tc->pool, bio);
2059                         remap_to_origin_and_issue(tc, bio);
2060                         break;
2061                 }
2062
2063                 zero_fill_bio(bio);
2064                 bio_endio(bio);
2065                 break;
2066
2067         default:
2068                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2069                             __func__, r);
2070                 if (cell)
2071                         cell_defer_no_holder(tc, cell);
2072                 bio_io_error(bio);
2073                 break;
2074         }
2075 }
2076
2077 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2078 {
2079         __process_bio_read_only(tc, bio, NULL);
2080 }
2081
2082 static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2083 {
2084         __process_bio_read_only(tc, cell->holder, cell);
2085 }
2086
2087 static void process_bio_success(struct thin_c *tc, struct bio *bio)
2088 {
2089         bio_endio(bio);
2090 }
2091
2092 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2093 {
2094         bio_io_error(bio);
2095 }
2096
2097 static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2098 {
2099         cell_success(tc->pool, cell);
2100 }
2101
2102 static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2103 {
2104         cell_error(tc->pool, cell);
2105 }
2106
2107 /*
2108  * FIXME: should we also commit due to size of transaction, measured in
2109  * metadata blocks?
2110  */
2111 static int need_commit_due_to_time(struct pool *pool)
2112 {
2113         return !time_in_range(jiffies, pool->last_commit_jiffies,
2114                               pool->last_commit_jiffies + COMMIT_PERIOD);
2115 }
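
/*
 * COMMIT_PERIOD is HZ (see the top of this file), so together with the
 * periodic waker below this bounds how long changed metadata can sit
 * uncommitted to roughly one second, even when no flush bio arrives to
 * force an earlier commit.
 */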
2116
2117 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2118 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2119
2120 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2121 {
2122         struct rb_node **rbp, *parent;
2123         struct dm_thin_endio_hook *pbd;
2124         sector_t bi_sector = bio->bi_iter.bi_sector;
2125
2126         rbp = &tc->sort_bio_list.rb_node;
2127         parent = NULL;
2128         while (*rbp) {
2129                 parent = *rbp;
2130                 pbd = thin_pbd(parent);
2131
2132                 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2133                         rbp = &(*rbp)->rb_left;
2134                 else
2135                         rbp = &(*rbp)->rb_right;
2136         }
2137
2138         pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2139         rb_link_node(&pbd->rb_node, parent, rbp);
2140         rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2141 }
2142
2143 static void __extract_sorted_bios(struct thin_c *tc)
2144 {
2145         struct rb_node *node;
2146         struct dm_thin_endio_hook *pbd;
2147         struct bio *bio;
2148
2149         for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2150                 pbd = thin_pbd(node);
2151                 bio = thin_bio(pbd);
2152
2153                 bio_list_add(&tc->deferred_bio_list, bio);
2154                 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2155         }
2156
2157         WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2158 }
2159
2160 static void __sort_thin_deferred_bios(struct thin_c *tc)
2161 {
2162         struct bio *bio;
2163         struct bio_list bios;
2164
2165         bio_list_init(&bios);
2166         bio_list_merge(&bios, &tc->deferred_bio_list);
2167         bio_list_init(&tc->deferred_bio_list);
2168
2169         /* Sort deferred_bio_list using rb-tree */
2170         while ((bio = bio_list_pop(&bios)))
2171                 __thin_bio_rb_add(tc, bio);
2172
2173         /*
2174          * Transfer the sorted bios in sort_bio_list back to
2175          * deferred_bio_list to allow lockless submission of
2176          * all bios.
2177          */
2178         __extract_sorted_bios(tc);
2179 }
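
/*
 * The sort above is purely an IO-pattern optimisation: deferred bios are
 * reordered by bi_sector so that, when they are issued under the blk plug
 * in process_thin_deferred_bios(), they reach the data device in roughly
 * ascending order.
 */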
2180
2181 static void process_thin_deferred_bios(struct thin_c *tc)
2182 {
2183         struct pool *pool = tc->pool;
2184         struct bio *bio;
2185         struct bio_list bios;
2186         struct blk_plug plug;
2187         unsigned int count = 0;
2188
2189         if (tc->requeue_mode) {
2190                 error_thin_bio_list(tc, &tc->deferred_bio_list,
2191                                 BLK_STS_DM_REQUEUE);
2192                 return;
2193         }
2194
2195         bio_list_init(&bios);
2196
2197         spin_lock_irq(&tc->lock);
2198
2199         if (bio_list_empty(&tc->deferred_bio_list)) {
2200                 spin_unlock_irq(&tc->lock);
2201                 return;
2202         }
2203
2204         __sort_thin_deferred_bios(tc);
2205
2206         bio_list_merge(&bios, &tc->deferred_bio_list);
2207         bio_list_init(&tc->deferred_bio_list);
2208
2209         spin_unlock_irq(&tc->lock);
2210
2211         blk_start_plug(&plug);
2212         while ((bio = bio_list_pop(&bios))) {
2213                 /*
2214                  * If we've got no free new_mapping structs, and processing
2215                  * this bio might require one, we pause until there are some
2216                  * prepared mappings to process.
2217                  */
2218                 if (ensure_next_mapping(pool)) {
2219                         spin_lock_irq(&tc->lock);
2220                         bio_list_add(&tc->deferred_bio_list, bio);
2221                         bio_list_merge(&tc->deferred_bio_list, &bios);
2222                         spin_unlock_irq(&tc->lock);
2223                         break;
2224                 }
2225
2226                 if (bio_op(bio) == REQ_OP_DISCARD)
2227                         pool->process_discard(tc, bio);
2228                 else
2229                         pool->process_bio(tc, bio);
2230
2231                 if ((count++ & 127) == 0) {
2232                         throttle_work_update(&pool->throttle);
2233                         dm_pool_issue_prefetches(pool->pmd);
2234                 }
2235                 cond_resched();
2236         }
2237         blk_finish_plug(&plug);
2238 }
2239
2240 static int cmp_cells(const void *lhs, const void *rhs)
2241 {
2242         struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2243         struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2244
2245         BUG_ON(!lhs_cell->holder);
2246         BUG_ON(!rhs_cell->holder);
2247
2248         if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2249                 return -1;
2250
2251         if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2252                 return 1;
2253
2254         return 0;
2255 }
2256
2257 static unsigned int sort_cells(struct pool *pool, struct list_head *cells)
2258 {
2259         unsigned int count = 0;
2260         struct dm_bio_prison_cell *cell, *tmp;
2261
2262         list_for_each_entry_safe(cell, tmp, cells, user_list) {
2263                 if (count >= CELL_SORT_ARRAY_SIZE)
2264                         break;
2265
2266                 pool->cell_sort_array[count++] = cell;
2267                 list_del(&cell->user_list);
2268         }
2269
2270         sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
2271
2272         return count;
2273 }
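
/*
 * Note that sort_cells() drains at most CELL_SORT_ARRAY_SIZE cells per
 * call; process_thin_deferred_cells() therefore loops, sorting and
 * processing one batch at a time until the deferred list is empty.
 */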
2274
2275 static void process_thin_deferred_cells(struct thin_c *tc)
2276 {
2277         struct pool *pool = tc->pool;
2278         struct list_head cells;
2279         struct dm_bio_prison_cell *cell;
2280         unsigned int i, j, count;
2281
2282         INIT_LIST_HEAD(&cells);
2283
2284         spin_lock_irq(&tc->lock);
2285         list_splice_init(&tc->deferred_cells, &cells);
2286         spin_unlock_irq(&tc->lock);
2287
2288         if (list_empty(&cells))
2289                 return;
2290
2291         do {
2292                 count = sort_cells(tc->pool, &cells);
2293
2294                 for (i = 0; i < count; i++) {
2295                         cell = pool->cell_sort_array[i];
2296                         BUG_ON(!cell->holder);
2297
2298                         /*
2299                          * If we've got no free new_mapping structs, and processing
2300                          * this bio might require one, we pause until there are some
2301                          * prepared mappings to process.
2302                          */
2303                         if (ensure_next_mapping(pool)) {
2304                                 for (j = i; j < count; j++)
2305                                         list_add(&pool->cell_sort_array[j]->user_list, &cells);
2306
2307                                 spin_lock_irq(&tc->lock);
2308                                 list_splice(&cells, &tc->deferred_cells);
2309                                 spin_unlock_irq(&tc->lock);
2310                                 return;
2311                         }
2312
2313                         if (bio_op(cell->holder) == REQ_OP_DISCARD)
2314                                 pool->process_discard_cell(tc, cell);
2315                         else
2316                                 pool->process_cell(tc, cell);
2317                 }
2318                 cond_resched();
2319         } while (!list_empty(&cells));
2320 }
2321
2322 static void thin_get(struct thin_c *tc);
2323 static void thin_put(struct thin_c *tc);
2324
2325 /*
2326  * We can't hold rcu_read_lock() around code that can block.  So we
2327  * find a thin with the rcu lock held; bump a refcount; then drop
2328  * the lock.
2329  */
2330 static struct thin_c *get_first_thin(struct pool *pool)
2331 {
2332         struct thin_c *tc = NULL;
2333
2334         rcu_read_lock();
2335         tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list);
2336         if (tc)
2337                 thin_get(tc);
2338         rcu_read_unlock();
2339
2340         return tc;
2341 }
2342
2343 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2344 {
2345         struct thin_c *old_tc = tc;
2346
2347         rcu_read_lock();
2348         list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2349                 thin_get(tc);
2350                 thin_put(old_tc);
2351                 rcu_read_unlock();
2352                 return tc;
2353         }
2354         thin_put(old_tc);
2355         rcu_read_unlock();
2356
2357         return NULL;
2358 }
2359
2360 static void process_deferred_bios(struct pool *pool)
2361 {
2362         struct bio *bio;
2363         struct bio_list bios, bio_completions;
2364         struct thin_c *tc;
2365
2366         tc = get_first_thin(pool);
2367         while (tc) {
2368                 process_thin_deferred_cells(tc);
2369                 process_thin_deferred_bios(tc);
2370                 tc = get_next_thin(pool, tc);
2371         }
2372
2373         /*
2374          * If there are any deferred flush bios, we must commit the metadata
2375          * before issuing them or signaling their completion.
2376          */
2377         bio_list_init(&bios);
2378         bio_list_init(&bio_completions);
2379
2380         spin_lock_irq(&pool->lock);
2381         bio_list_merge(&bios, &pool->deferred_flush_bios);
2382         bio_list_init(&pool->deferred_flush_bios);
2383
2384         bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
2385         bio_list_init(&pool->deferred_flush_completions);
2386         spin_unlock_irq(&pool->lock);
2387
2388         if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
2389             !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
2390                 return;
2391
2392         if (commit(pool)) {
2393                 bio_list_merge(&bios, &bio_completions);
2394
2395                 while ((bio = bio_list_pop(&bios)))
2396                         bio_io_error(bio);
2397                 return;
2398         }
2399         pool->last_commit_jiffies = jiffies;
2400
2401         while ((bio = bio_list_pop(&bio_completions)))
2402                 bio_endio(bio);
2403
2404         while ((bio = bio_list_pop(&bios))) {
2405                 /*
2406                  * The data device was flushed as part of metadata commit,
2407                  * so complete redundant flushes immediately.
2408                  */
2409                 if (bio->bi_opf & REQ_PREFLUSH)
2410                         bio_endio(bio);
2411                 else
2412                         dm_submit_bio_remap(bio, NULL);
2413         }
2414 }
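
/*
 * In process_deferred_bios() above, the two lists are treated differently
 * once commit() succeeds: bios on deferred_flush_completions only needed
 * the commit before their completion could be signalled, so they are ended
 * straight away; bios on deferred_flush_bios still have to be issued,
 * except that redundant empty flushes are completed immediately because
 * the metadata commit already flushed the data device.
 */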
2415
2416 static void do_worker(struct work_struct *ws)
2417 {
2418         struct pool *pool = container_of(ws, struct pool, worker);
2419
2420         throttle_work_start(&pool->throttle);
2421         dm_pool_issue_prefetches(pool->pmd);
2422         throttle_work_update(&pool->throttle);
2423         process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
2424         throttle_work_update(&pool->throttle);
2425         process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
2426         throttle_work_update(&pool->throttle);
2427         process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2428         throttle_work_update(&pool->throttle);
2429         process_deferred_bios(pool);
2430         throttle_work_complete(&pool->throttle);
2431 }
2432
2433 /*
2434  * We want to commit periodically so that not too much
2435  * unwritten data builds up.
2436  */
2437 static void do_waker(struct work_struct *ws)
2438 {
2439         struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2440
2441         wake_worker(pool);
2442         queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2443 }
2444
2445 /*
2446  * We're holding onto IO to allow userland time to react.  After the
2447  * timeout either the pool will have been resized (and thus back in
2448  * PM_WRITE mode), or we stay in PM_OUT_OF_DATA_SPACE with error_if_no_space set.
2449  */
2450 static void do_no_space_timeout(struct work_struct *ws)
2451 {
2452         struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2453                                          no_space_timeout);
2454
2455         if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2456                 pool->pf.error_if_no_space = true;
2457                 notify_of_pool_mode_change(pool);
2458                 error_retry_list_with_code(pool, BLK_STS_NOSPC);
2459         }
2460 }
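
/*
 * Example timeline, assuming the default no_space_timeout of
 * NO_SPACE_TIMEOUT_SECS (60s) and error_if_no_space off:
 *
 *   t=0   the pool runs out of data space; set_pool_mode(PM_OUT_OF_DATA_SPACE)
 *         queues this delayed work and new writes are parked.
 *   t<60  if userland grows the data device and the pool is resumed, the
 *         parked bios are retried (see retry_on_resume() and requeue_bios());
 *         re-entering PM_WRITE cancels this work.
 *   t=60  otherwise this handler turns error_if_no_space on and the parked
 *         bios are failed with BLK_STS_NOSPC.
 */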
2461
2462 /*----------------------------------------------------------------*/
2463
2464 struct pool_work {
2465         struct work_struct worker;
2466         struct completion complete;
2467 };
2468
2469 static struct pool_work *to_pool_work(struct work_struct *ws)
2470 {
2471         return container_of(ws, struct pool_work, worker);
2472 }
2473
2474 static void pool_work_complete(struct pool_work *pw)
2475 {
2476         complete(&pw->complete);
2477 }
2478
2479 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2480                            void (*fn)(struct work_struct *))
2481 {
2482         INIT_WORK_ONSTACK(&pw->worker, fn);
2483         init_completion(&pw->complete);
2484         queue_work(pool->wq, &pw->worker);
2485         wait_for_completion(&pw->complete);
2486         destroy_work_on_stack(&pw->worker);
2487 }
2488
2489 /*----------------------------------------------------------------*/
2490
2491 struct noflush_work {
2492         struct pool_work pw;
2493         struct thin_c *tc;
2494 };
2495
2496 static struct noflush_work *to_noflush(struct work_struct *ws)
2497 {
2498         return container_of(to_pool_work(ws), struct noflush_work, pw);
2499 }
2500
2501 static void do_noflush_start(struct work_struct *ws)
2502 {
2503         struct noflush_work *w = to_noflush(ws);
2504
2505         w->tc->requeue_mode = true;
2506         requeue_io(w->tc);
2507         pool_work_complete(&w->pw);
2508 }
2509
2510 static void do_noflush_stop(struct work_struct *ws)
2511 {
2512         struct noflush_work *w = to_noflush(ws);
2513
2514         w->tc->requeue_mode = false;
2515         pool_work_complete(&w->pw);
2516 }
2517
2518 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2519 {
2520         struct noflush_work w;
2521
2522         w.tc = tc;
2523         pool_work_wait(&w.pw, tc->pool, fn);
2524 }
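
/*
 * noflush_work() lets another thread flip a thin device in and out of
 * requeue_mode from the pool's workqueue and wait for the change to take
 * effect: do_noflush_start() starts requeueing all IO for that device
 * (already-deferred IO is requeued immediately), and do_noflush_stop()
 * clears the flag again.  This is used around a no-flush suspend of the
 * thin target.
 */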
2525
2526 /*----------------------------------------------------------------*/
2527
2528 static void set_discard_callbacks(struct pool *pool)
2529 {
2530         struct pool_c *pt = pool->ti->private;
2531
2532         if (pt->adjusted_pf.discard_passdown) {
2533                 pool->process_discard_cell = process_discard_cell_passdown;
2534                 pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2535                 pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2536         } else {
2537                 pool->process_discard_cell = process_discard_cell_no_passdown;
2538                 pool->process_prepared_discard = process_prepared_discard_no_passdown;
2539         }
2540 }
2541
2542 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2543 {
2544         struct pool_c *pt = pool->ti->private;
2545         bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2546         enum pool_mode old_mode = get_pool_mode(pool);
2547         unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2548
2549         /*
2550          * Never allow the pool to transition to PM_WRITE mode if user
2551          * intervention is required to verify metadata and data consistency.
2552          */
2553         if (new_mode == PM_WRITE && needs_check) {
2554                 DMERR("%s: unable to switch pool to write mode until repaired.",
2555                       dm_device_name(pool->pool_md));
2556                 if (old_mode != new_mode)
2557                         new_mode = old_mode;
2558                 else
2559                         new_mode = PM_READ_ONLY;
2560         }
2561         /*
2562          * If we were in PM_FAIL mode, rollback of metadata failed.  We're
2563          * not going to recover without a thin_repair.  So we never let the
2564          * pool move out of the old mode.
2565          */
2566         if (old_mode == PM_FAIL)
2567                 new_mode = old_mode;
2568
2569         switch (new_mode) {
2570         case PM_FAIL:
2571                 dm_pool_metadata_read_only(pool->pmd);
2572                 pool->process_bio = process_bio_fail;
2573                 pool->process_discard = process_bio_fail;
2574                 pool->process_cell = process_cell_fail;
2575                 pool->process_discard_cell = process_cell_fail;
2576                 pool->process_prepared_mapping = process_prepared_mapping_fail;
2577                 pool->process_prepared_discard = process_prepared_discard_fail;
2578
2579                 error_retry_list(pool);
2580                 break;
2581
2582         case PM_OUT_OF_METADATA_SPACE:
2583         case PM_READ_ONLY:
2584                 dm_pool_metadata_read_only(pool->pmd);
2585                 pool->process_bio = process_bio_read_only;
2586                 pool->process_discard = process_bio_success;
2587                 pool->process_cell = process_cell_read_only;
2588                 pool->process_discard_cell = process_cell_success;
2589                 pool->process_prepared_mapping = process_prepared_mapping_fail;
2590                 pool->process_prepared_discard = process_prepared_discard_success;
2591
2592                 error_retry_list(pool);
2593                 break;
2594
2595         case PM_OUT_OF_DATA_SPACE:
2596                 /*
2597                  * Ideally we'd never hit this state; the low water mark
2598                  * would trigger userland to extend the pool before we
2599                  * completely run out of data space.  However, many small
2600                  * IOs to unprovisioned space can consume data space at an
2601                  * alarming rate.  Adjust your low water mark if you're
2602                  * frequently seeing this mode.
2603                  */
2604                 pool->out_of_data_space = true;
2605                 pool->process_bio = process_bio_read_only;
2606                 pool->process_discard = process_discard_bio;
2607                 pool->process_cell = process_cell_read_only;
2608                 pool->process_prepared_mapping = process_prepared_mapping;
2609                 set_discard_callbacks(pool);
2610
2611                 if (!pool->pf.error_if_no_space && no_space_timeout)
2612                         queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
2613                 break;
2614
2615         case PM_WRITE:
2616                 if (old_mode == PM_OUT_OF_DATA_SPACE)
2617                         cancel_delayed_work_sync(&pool->no_space_timeout);
2618                 pool->out_of_data_space = false;
2619                 pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2620                 dm_pool_metadata_read_write(pool->pmd);
2621                 pool->process_bio = process_bio;
2622                 pool->process_discard = process_discard_bio;
2623                 pool->process_cell = process_cell;
2624                 pool->process_prepared_mapping = process_prepared_mapping;
2625                 set_discard_callbacks(pool);
2626                 break;
2627         }
2628
2629         pool->pf.mode = new_mode;
2630         /*
2631          * The pool mode may have changed, sync it so bind_control_target()
2632          * doesn't cause an unexpected mode transition on resume.
2633          */
2634         pt->adjusted_pf.mode = new_mode;
2635
2636         if (old_mode != new_mode)
2637                 notify_of_pool_mode_change(pool);
2638 }
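
/*
 * Summary of the handlers installed above:
 *
 *   PM_WRITE                 normal read/write service
 *   PM_OUT_OF_DATA_SPACE     reads and discards serviced; writes that need
 *                            new space are queued (or errored with -ENOSPC
 *                            if error_if_no_space is set)
 *   PM_OUT_OF_METADATA_SPACE,
 *   PM_READ_ONLY             metadata read-only; reads serviced, writes
 *                            errored, discards completed without effect
 *   PM_FAIL                  all IO errored
 */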
2639
2640 static void abort_transaction(struct pool *pool)
2641 {
2642         const char *dev_name = dm_device_name(pool->pool_md);
2643
2644         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2645         if (dm_pool_abort_metadata(pool->pmd)) {
2646                 DMERR("%s: failed to abort metadata transaction", dev_name);
2647                 set_pool_mode(pool, PM_FAIL);
2648         }
2649
2650         if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2651                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2652                 set_pool_mode(pool, PM_FAIL);
2653         }
2654 }
2655
2656 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2657 {
2658         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2659                     dm_device_name(pool->pool_md), op, r);
2660
2661         abort_transaction(pool);
2662         set_pool_mode(pool, PM_READ_ONLY);
2663 }
2664
2665 /*----------------------------------------------------------------*/
2666
2667 /*
2668  * Mapping functions.
2669  */
2670
2671 /*
2672  * Called only while mapping a thin bio to hand it over to the workqueue.
2673  */
2674 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2675 {
2676         struct pool *pool = tc->pool;
2677
2678         spin_lock_irq(&tc->lock);
2679         bio_list_add(&tc->deferred_bio_list, bio);
2680         spin_unlock_irq(&tc->lock);
2681
2682         wake_worker(pool);
2683 }
2684
2685 static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2686 {
2687         struct pool *pool = tc->pool;
2688
2689         throttle_lock(&pool->throttle);
2690         thin_defer_bio(tc, bio);
2691         throttle_unlock(&pool->throttle);
2692 }
2693
2694 static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2695 {
2696         struct pool *pool = tc->pool;
2697
2698         throttle_lock(&pool->throttle);
2699         spin_lock_irq(&tc->lock);
2700         list_add_tail(&cell->user_list, &tc->deferred_cells);
2701         spin_unlock_irq(&tc->lock);
2702         throttle_unlock(&pool->throttle);
2703
2704         wake_worker(pool);
2705 }
2706
2707 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2708 {
2709         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2710
2711         h->tc = tc;
2712         h->shared_read_entry = NULL;
2713         h->all_io_entry = NULL;
2714         h->overwrite_mapping = NULL;
2715         h->cell = NULL;
2716 }
2717
2718 /*
2719  * Non-blocking function called from the thin target's map function.
2720  */
2721 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2722 {
2723         int r;
2724         struct thin_c *tc = ti->private;
2725         dm_block_t block = get_bio_block(tc, bio);
2726         struct dm_thin_device *td = tc->td;
2727         struct dm_thin_lookup_result result;
2728         struct dm_bio_prison_cell *virt_cell, *data_cell;
2729         struct dm_cell_key key;
2730
2731         thin_hook_bio(tc, bio);
2732
2733         if (tc->requeue_mode) {
2734                 bio->bi_status = BLK_STS_DM_REQUEUE;
2735                 bio_endio(bio);
2736                 return DM_MAPIO_SUBMITTED;
2737         }
2738
2739         if (get_pool_mode(tc->pool) == PM_FAIL) {
2740                 bio_io_error(bio);
2741                 return DM_MAPIO_SUBMITTED;
2742         }
2743
2744         if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
2745                 thin_defer_bio_with_throttle(tc, bio);
2746                 return DM_MAPIO_SUBMITTED;
2747         }
2748
2749         /*
2750          * We must hold the virtual cell before doing the lookup, otherwise
2751          * there's a race with discard.
2752          */
2753         build_virtual_key(tc->td, block, &key);
2754         if (bio_detain(tc->pool, &key, bio, &virt_cell))
2755                 return DM_MAPIO_SUBMITTED;
2756
2757         r = dm_thin_find_block(td, block, 0, &result);
2758
2759         /*
2760          * Note that we defer readahead too.
2761          */
2762         switch (r) {
2763         case 0:
2764                 if (unlikely(result.shared)) {
2765                         /*
2766                          * We have a race condition here between the
2767                          * result.shared value returned by the lookup and
2768                          * snapshot creation, which may cause new
2769                          * sharing.
2770                          *
2771                          * To avoid this, always quiesce the origin before
2772                          * taking the snap.  You want to do this anyway to
2773                          * ensure a consistent application view
2774                          * (i.e. lockfs).
2775                          *
2776                          * More distant ancestors are irrelevant. The
2777                          * shared flag will be set in their case.
2778                          */
2779                         thin_defer_cell(tc, virt_cell);
2780                         return DM_MAPIO_SUBMITTED;
2781                 }
2782
2783                 build_data_key(tc->td, result.block, &key);
2784                 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2785                         cell_defer_no_holder(tc, virt_cell);
2786                         return DM_MAPIO_SUBMITTED;
2787                 }
2788
2789                 inc_all_io_entry(tc->pool, bio);
2790                 cell_defer_no_holder(tc, data_cell);
2791                 cell_defer_no_holder(tc, virt_cell);
2792
2793                 remap(tc, bio, result.block);
2794                 return DM_MAPIO_REMAPPED;
2795
2796         case -ENODATA:
2797         case -EWOULDBLOCK:
2798                 thin_defer_cell(tc, virt_cell);
2799                 return DM_MAPIO_SUBMITTED;
2800
2801         default:
2802                 /*
2803                  * Must always call bio_io_error on failure.
2804                  * dm_thin_find_block can fail with -EINVAL if the
2805                  * pool is switched to fail-io mode.
2806                  */
2807                 bio_io_error(bio);
2808                 cell_defer_no_holder(tc, virt_cell);
2809                 return DM_MAPIO_SUBMITTED;
2810         }
2811 }
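
/*
 * So the map function only remaps in place for the easy case: an existing,
 * unshared mapping with no discard racing on the same virtual block.
 * Everything else (flushes, discards, shared or unprovisioned blocks,
 * lookups that would block) is deferred to the pool's worker thread, or
 * failed outright in requeue/fail modes.
 */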
2812
2813 static void requeue_bios(struct pool *pool)
2814 {
2815         struct thin_c *tc;
2816
2817         rcu_read_lock();
2818         list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2819                 spin_lock_irq(&tc->lock);
2820                 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2821                 bio_list_init(&tc->retry_on_resume_list);
2822                 spin_unlock_irq(&tc->lock);
2823         }
2824         rcu_read_unlock();
2825 }
2826
2827 /*
2828  *--------------------------------------------------------------
2829  * Binding of control targets to a pool object
2830  *--------------------------------------------------------------
2831  */
2832 static bool is_factor(sector_t block_size, uint32_t n)
2833 {
2834         return !sector_div(block_size, n);
2835 }
2836
2837 /*
2838  * If discard_passdown was enabled, verify that the data device
2839  * supports discards.  Disable discard_passdown if not.
2840  */
2841 static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
2842 {
2843         struct pool *pool = pt->pool;
2844         struct block_device *data_bdev = pt->data_dev->bdev;
2845         struct queue_limits *data_limits = bdev_limits(data_bdev);
2846         const char *reason = NULL;
2847
2848         if (!pt->adjusted_pf.discard_passdown)
2849                 return;
2850
2851         if (!bdev_max_discard_sectors(pt->data_dev->bdev))
2852                 reason = "discard unsupported";
2853
2854         else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2855                 reason = "max discard sectors smaller than a block";
2856
2857         if (reason) {
2858                 DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
2859                 pt->adjusted_pf.discard_passdown = false;
2860         }
2861 }
2862
2863 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2864 {
2865         struct pool_c *pt = ti->private;
2866
2867         /*
2868          * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2869          */
2870         enum pool_mode old_mode = get_pool_mode(pool);
2871         enum pool_mode new_mode = pt->adjusted_pf.mode;
2872
2873         /*
2874          * Don't change the pool's mode until set_pool_mode() below.
2875          * Otherwise the pool's process_* function pointers may
2876          * not match the desired pool mode.
2877          */
2878         pt->adjusted_pf.mode = old_mode;
2879
2880         pool->ti = ti;
2881         pool->pf = pt->adjusted_pf;
2882         pool->low_water_blocks = pt->low_water_blocks;
2883
2884         set_pool_mode(pool, new_mode);
2885
2886         return 0;
2887 }
2888
2889 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2890 {
2891         if (pool->ti == ti)
2892                 pool->ti = NULL;
2893 }
2894
2895 /*
2896  *--------------------------------------------------------------
2897  * Pool creation
2898  *--------------------------------------------------------------
2899  */
2900 /* Initialize pool features. */
2901 static void pool_features_init(struct pool_features *pf)
2902 {
2903         pf->mode = PM_WRITE;
2904         pf->zero_new_blocks = true;
2905         pf->discard_enabled = true;
2906         pf->discard_passdown = true;
2907         pf->error_if_no_space = false;
2908 }
2909
2910 static void __pool_destroy(struct pool *pool)
2911 {
2912         __pool_table_remove(pool);
2913
2914         vfree(pool->cell_sort_array);
2915         if (dm_pool_metadata_close(pool->pmd) < 0)
2916                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2917
2918         dm_bio_prison_destroy(pool->prison);
2919         dm_kcopyd_client_destroy(pool->copier);
2920
2921         cancel_delayed_work_sync(&pool->waker);
2922         cancel_delayed_work_sync(&pool->no_space_timeout);
2923         if (pool->wq)
2924                 destroy_workqueue(pool->wq);
2925
2926         if (pool->next_mapping)
2927                 mempool_free(pool->next_mapping, &pool->mapping_pool);
2928         mempool_exit(&pool->mapping_pool);
2929         dm_deferred_set_destroy(pool->shared_read_ds);
2930         dm_deferred_set_destroy(pool->all_io_ds);
2931         kfree(pool);
2932 }
2933
2934 static struct kmem_cache *_new_mapping_cache;
2935
2936 static struct pool *pool_create(struct mapped_device *pool_md,
2937                                 struct block_device *metadata_dev,
2938                                 struct block_device *data_dev,
2939                                 unsigned long block_size,
2940                                 int read_only, char **error)
2941 {
2942         int r;
2943         void *err_p;
2944         struct pool *pool;
2945         struct dm_pool_metadata *pmd;
2946         bool format_device = read_only ? false : true;
2947
2948         pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2949         if (IS_ERR(pmd)) {
2950                 *error = "Error creating metadata object";
2951                 return ERR_CAST(pmd);
2952         }
2953
2954         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
2955         if (!pool) {
2956                 *error = "Error allocating memory for pool";
2957                 err_p = ERR_PTR(-ENOMEM);
2958                 goto bad_pool;
2959         }
2960
2961         pool->pmd = pmd;
2962         pool->sectors_per_block = block_size;
2963         if (block_size & (block_size - 1))
2964                 pool->sectors_per_block_shift = -1;
2965         else
2966                 pool->sectors_per_block_shift = __ffs(block_size);
2967         pool->low_water_blocks = 0;
2968         pool_features_init(&pool->pf);
2969         pool->prison = dm_bio_prison_create();
2970         if (!pool->prison) {
2971                 *error = "Error creating pool's bio prison";
2972                 err_p = ERR_PTR(-ENOMEM);
2973                 goto bad_prison;
2974         }
2975
2976         pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2977         if (IS_ERR(pool->copier)) {
2978                 r = PTR_ERR(pool->copier);
2979                 *error = "Error creating pool's kcopyd client";
2980                 err_p = ERR_PTR(r);
2981                 goto bad_kcopyd_client;
2982         }
2983
2984         /*
2985          * Create a single-threaded workqueue that will service all devices
2986          * that use this metadata.
2987          */
2988         pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2989         if (!pool->wq) {
2990                 *error = "Error creating pool's workqueue";
2991                 err_p = ERR_PTR(-ENOMEM);
2992                 goto bad_wq;
2993         }
2994
2995         throttle_init(&pool->throttle);
2996         INIT_WORK(&pool->worker, do_worker);
2997         INIT_DELAYED_WORK(&pool->waker, do_waker);
2998         INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2999         spin_lock_init(&pool->lock);
3000         bio_list_init(&pool->deferred_flush_bios);
3001         bio_list_init(&pool->deferred_flush_completions);
3002         INIT_LIST_HEAD(&pool->prepared_mappings);
3003         INIT_LIST_HEAD(&pool->prepared_discards);
3004         INIT_LIST_HEAD(&pool->prepared_discards_pt2);
3005         INIT_LIST_HEAD(&pool->active_thins);
3006         pool->low_water_triggered = false;
3007         pool->suspended = true;
3008         pool->out_of_data_space = false;
3009
3010         pool->shared_read_ds = dm_deferred_set_create();
3011         if (!pool->shared_read_ds) {
3012                 *error = "Error creating pool's shared read deferred set";
3013                 err_p = ERR_PTR(-ENOMEM);
3014                 goto bad_shared_read_ds;
3015         }
3016
3017         pool->all_io_ds = dm_deferred_set_create();
3018         if (!pool->all_io_ds) {
3019                 *error = "Error creating pool's all io deferred set";
3020                 err_p = ERR_PTR(-ENOMEM);
3021                 goto bad_all_io_ds;
3022         }
3023
3024         pool->next_mapping = NULL;
3025         r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
3026                                    _new_mapping_cache);
3027         if (r) {
3028                 *error = "Error creating pool's mapping mempool";
3029                 err_p = ERR_PTR(r);
3030                 goto bad_mapping_pool;
3031         }
3032
3033         pool->cell_sort_array =
3034                 vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3035                                    sizeof(*pool->cell_sort_array)));
3036         if (!pool->cell_sort_array) {
3037                 *error = "Error allocating cell sort array";
3038                 err_p = ERR_PTR(-ENOMEM);
3039                 goto bad_sort_array;
3040         }
3041
3042         pool->ref_count = 1;
3043         pool->last_commit_jiffies = jiffies;
3044         pool->pool_md = pool_md;
3045         pool->md_dev = metadata_dev;
3046         pool->data_dev = data_dev;
3047         __pool_table_insert(pool);
3048
3049         return pool;
3050
3051 bad_sort_array:
3052         mempool_exit(&pool->mapping_pool);
3053 bad_mapping_pool:
3054         dm_deferred_set_destroy(pool->all_io_ds);
3055 bad_all_io_ds:
3056         dm_deferred_set_destroy(pool->shared_read_ds);
3057 bad_shared_read_ds:
3058         destroy_workqueue(pool->wq);
3059 bad_wq:
3060         dm_kcopyd_client_destroy(pool->copier);
3061 bad_kcopyd_client:
3062         dm_bio_prison_destroy(pool->prison);
3063 bad_prison:
3064         kfree(pool);
3065 bad_pool:
3066         if (dm_pool_metadata_close(pmd))
3067                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3068
3069         return err_p;
3070 }
3071
3072 static void __pool_inc(struct pool *pool)
3073 {
3074         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3075         pool->ref_count++;
3076 }
3077
3078 static void __pool_dec(struct pool *pool)
3079 {
3080         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3081         BUG_ON(!pool->ref_count);
3082         if (!--pool->ref_count)
3083                 __pool_destroy(pool);
3084 }
3085
3086 static struct pool *__pool_find(struct mapped_device *pool_md,
3087                                 struct block_device *metadata_dev,
3088                                 struct block_device *data_dev,
3089                                 unsigned long block_size, int read_only,
3090                                 char **error, int *created)
3091 {
3092         struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
3093
3094         if (pool) {
3095                 if (pool->pool_md != pool_md) {
3096                         *error = "metadata device already in use by a pool";
3097                         return ERR_PTR(-EBUSY);
3098                 }
3099                 if (pool->data_dev != data_dev) {
3100                         *error = "data device already in use by a pool";
3101                         return ERR_PTR(-EBUSY);
3102                 }
3103                 __pool_inc(pool);
3104
3105         } else {
3106                 pool = __pool_table_lookup(pool_md);
3107                 if (pool) {
3108                         if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
3109                                 *error = "different pool cannot replace a pool";
3110                                 return ERR_PTR(-EINVAL);
3111                         }
3112                         __pool_inc(pool);
3113
3114                 } else {
3115                         pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
3116                         *created = 1;
3117                 }
3118         }
3119
3120         return pool;
3121 }
3122
3123 /*
3124  *--------------------------------------------------------------
3125  * Pool target methods
3126  *--------------------------------------------------------------
3127  */
3128 static void pool_dtr(struct dm_target *ti)
3129 {
3130         struct pool_c *pt = ti->private;
3131
3132         mutex_lock(&dm_thin_pool_table.mutex);
3133
3134         unbind_control_target(pt->pool, ti);
3135         __pool_dec(pt->pool);
3136         dm_put_device(ti, pt->metadata_dev);
3137         dm_put_device(ti, pt->data_dev);
3138         kfree(pt);
3139
3140         mutex_unlock(&dm_thin_pool_table.mutex);
3141 }
3142
3143 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3144                                struct dm_target *ti)
3145 {
3146         int r;
3147         unsigned int argc;
3148         const char *arg_name;
3149
3150         static const struct dm_arg _args[] = {
3151                 {0, 4, "Invalid number of pool feature arguments"},
3152         };
3153
3154         /*
3155          * No feature arguments supplied.
3156          */
3157         if (!as->argc)
3158                 return 0;
3159
3160         r = dm_read_arg_group(_args, as, &argc, &ti->error);
3161         if (r)
3162                 return -EINVAL;
3163
3164         while (argc && !r) {
3165                 arg_name = dm_shift_arg(as);
3166                 argc--;
3167
3168                 if (!strcasecmp(arg_name, "skip_block_zeroing"))
3169                         pf->zero_new_blocks = false;
3170
3171                 else if (!strcasecmp(arg_name, "ignore_discard"))
3172                         pf->discard_enabled = false;
3173
3174                 else if (!strcasecmp(arg_name, "no_discard_passdown"))
3175                         pf->discard_passdown = false;
3176
3177                 else if (!strcasecmp(arg_name, "read_only"))
3178                         pf->mode = PM_READ_ONLY;
3179
3180                 else if (!strcasecmp(arg_name, "error_if_no_space"))
3181                         pf->error_if_no_space = true;
3182
3183                 else {
3184                         ti->error = "Unrecognised pool feature requested";
3185                         r = -EINVAL;
3186                         break;
3187                 }
3188         }
3189
3190         return r;
3191 }
3192
3193 static void metadata_low_callback(void *context)
3194 {
3195         struct pool *pool = context;
3196
3197         DMWARN("%s: reached low water mark for metadata device: sending event.",
3198                dm_device_name(pool->pool_md));
3199
3200         dm_table_event(pool->ti->table);
3201 }
3202
3203 /*
3204  * We need to flush the data device **before** committing the metadata.
3205  *
3206  * This ensures that the data blocks of any newly inserted mappings are
3207  * properly written to non-volatile storage and won't be lost in case of a
3208  * crash.
3209  *
3210  * Failure to do so can result in data corruption in the case of internal or
3211  * external snapshots and in the case of newly provisioned blocks, when block
3212  * zeroing is enabled.
3213  */
3214 static int metadata_pre_commit_callback(void *context)
3215 {
3216         struct pool *pool = context;
3217
3218         return blkdev_issue_flush(pool->data_dev);
3219 }
3220
3221 static sector_t get_dev_size(struct block_device *bdev)
3222 {
3223         return bdev_nr_sectors(bdev);
3224 }
3225
3226 static void warn_if_metadata_device_too_big(struct block_device *bdev)
3227 {
3228         sector_t metadata_dev_size = get_dev_size(bdev);
3229
3230         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3231                 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
3232                        bdev, THIN_METADATA_MAX_SECTORS);
3233 }
3234
3235 static sector_t get_metadata_dev_size(struct block_device *bdev)
3236 {
3237         sector_t metadata_dev_size = get_dev_size(bdev);
3238
3239         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3240                 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3241
3242         return metadata_dev_size;
3243 }
3244
3245 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3246 {
3247         sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3248
3249         sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3250
3251         return metadata_dev_size;
3252 }
3253
3254 /*
3255  * When a metadata threshold is crossed a dm event is triggered, and
3256  * userland should respond by growing the metadata device.  We could let
3257  * userland set the threshold, like we do with the data threshold, but I'm
3258  * not sure they know enough to do this well.
3259  */
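/*
 * Illustrative arithmetic (assumed sizes, not requirements): with 4KiB
 * metadata blocks, a 16MiB metadata device holds 4096 blocks, so the
 * quarter computed below is 1024 blocks and the threshold becomes
 * min(1024, 1024) = 1024 blocks, i.e. the 4M cap.
 */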
3260 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3261 {
3262         /*
3263          * 4M is ample for all ops with the possible exception of thin
3264          * device deletion which is harmless if it fails (just retry the
3265          * delete after you've grown the device).
3266          */
3267         dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
3268
3269         return min((dm_block_t)1024ULL /* 4M */, quarter);
3270 }
3271
3272 /*
3273  * thin-pool <metadata dev> <data dev>
3274  *           <data block size (sectors)>
3275  *           <low water mark (blocks)>
3276  *           [<#feature args> [<arg>]*]
3277  *
3278  * Optional feature arguments are:
3279  *           skip_block_zeroing: skip zeroing of newly-provisioned blocks.
3280  *           ignore_discard: disable discard support.
3281  *           no_discard_passdown: don't pass discards down to the data device.
3282  *           read_only: don't allow any changes to be made to the pool metadata.
3283  *           error_if_no_space: error IOs, instead of queueing them, if no space.
3284  */
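/*
 * Example table line (illustrative only; the device paths and sizes are
 * assumptions, not requirements):
 *
 *   0 2097152 thin-pool /dev/sdb /dev/sdc 128 1024 1 skip_block_zeroing
 *
 * i.e. a pool whose 1GiB data device /dev/sdc is carved into 64KiB
 * (128 sector) blocks, raising an event once fewer than 1024 free data
 * blocks remain, with zeroing of newly-provisioned blocks disabled.
 */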
3285 static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3286 {
3287         int r, pool_created = 0;
3288         struct pool_c *pt;
3289         struct pool *pool;
3290         struct pool_features pf;
3291         struct dm_arg_set as;
3292         struct dm_dev *data_dev;
3293         unsigned long block_size;
3294         dm_block_t low_water_blocks;
3295         struct dm_dev *metadata_dev;
3296         blk_mode_t metadata_mode;
3297
3298         /*
3299          * FIXME Remove validation from scope of lock.
3300          */
3301         mutex_lock(&dm_thin_pool_table.mutex);
3302
3303         if (argc < 4) {
3304                 ti->error = "Invalid argument count";
3305                 r = -EINVAL;
3306                 goto out_unlock;
3307         }
3308
3309         as.argc = argc;
3310         as.argv = argv;
3311
3312         /* make sure metadata and data are different devices */
3313         if (!strcmp(argv[0], argv[1])) {
3314                 ti->error = "Error setting metadata or data device";
3315                 r = -EINVAL;
3316                 goto out_unlock;
3317         }
3318
3319         /*
3320          * Set default pool features.
3321          */
3322         pool_features_init(&pf);
3323
3324         dm_consume_args(&as, 4);
3325         r = parse_pool_features(&as, &pf, ti);
3326         if (r)
3327                 goto out_unlock;
3328
3329         metadata_mode = BLK_OPEN_READ |
3330                 ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE);
3331         r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
3332         if (r) {
3333                 ti->error = "Error opening metadata block device";
3334                 goto out_unlock;
3335         }
3336         warn_if_metadata_device_too_big(metadata_dev->bdev);
3337
3338         r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, &data_dev);
3339         if (r) {
3340                 ti->error = "Error getting data device";
3341                 goto out_metadata;
3342         }
3343
3344         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
3345             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3346             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
3347             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
3348                 ti->error = "Invalid block size";
3349                 r = -EINVAL;
3350                 goto out;
3351         }
3352
3353         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
3354                 ti->error = "Invalid low water mark";
3355                 r = -EINVAL;
3356                 goto out;
3357         }
3358
3359         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
3360         if (!pt) {
3361                 r = -ENOMEM;
3362                 goto out;
3363         }
3364
3365         pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
3366                            block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
3367         if (IS_ERR(pool)) {
3368                 r = PTR_ERR(pool);
3369                 goto out_free_pt;
3370         }
3371
3372         /*
3373          * 'pool_created' reflects whether this is the first table load.
3374          * Top level discard support is not allowed to be changed after
3375          * initial load.  This would require a pool reload to trigger thin
3376          * device changes.
3377          */
3378         if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3379                 ti->error = "Discard support cannot be disabled once enabled";
3380                 r = -EINVAL;
3381                 goto out_flags_changed;
3382         }
3383
3384         pt->pool = pool;
3385         pt->ti = ti;
3386         pt->metadata_dev = metadata_dev;
3387         pt->data_dev = data_dev;
3388         pt->low_water_blocks = low_water_blocks;
3389         pt->adjusted_pf = pt->requested_pf = pf;
3390         ti->num_flush_bios = 1;
3391         ti->limit_swap_bios = true;
3392
3393         /*
3394          * Only need to enable discards if the pool should pass
3395          * them down to the data device.  The thin device's discard
3396          * processing will cause mappings to be removed from the btree.
3397          */
3398         if (pf.discard_enabled && pf.discard_passdown) {
3399                 ti->num_discard_bios = 1;
3400                 /*
3401                  * Setting 'discards_supported' circumvents the normal
3402                  * stacking of discard limits (this keeps the pool and
3403                  * thin devices' discard limits consistent).
3404                  */
3405                 ti->discards_supported = true;
3406                 ti->max_discard_granularity = true;
3407         }
3408         ti->private = pt;
3409
3410         r = dm_pool_register_metadata_threshold(pt->pool->pmd,
3411                                                 calc_metadata_threshold(pt),
3412                                                 metadata_low_callback,
3413                                                 pool);
3414         if (r) {
3415                 ti->error = "Error registering metadata threshold";
3416                 goto out_flags_changed;
3417         }
3418
3419         dm_pool_register_pre_commit_callback(pool->pmd,
3420                                              metadata_pre_commit_callback, pool);
3421
3422         mutex_unlock(&dm_thin_pool_table.mutex);
3423
3424         return 0;
3425
3426 out_flags_changed:
3427         __pool_dec(pool);
3428 out_free_pt:
3429         kfree(pt);
3430 out:
3431         dm_put_device(ti, data_dev);
3432 out_metadata:
3433         dm_put_device(ti, metadata_dev);
3434 out_unlock:
3435         mutex_unlock(&dm_thin_pool_table.mutex);
3436
3437         return r;
3438 }
3439
3440 static int pool_map(struct dm_target *ti, struct bio *bio)
3441 {
3442         struct pool_c *pt = ti->private;
3443         struct pool *pool = pt->pool;
3444
3445         /*
3446          * As this is a singleton target, ti->begin is always zero.
3447          */
3448         spin_lock_irq(&pool->lock);
3449         bio_set_dev(bio, pt->data_dev->bdev);
3450         spin_unlock_irq(&pool->lock);
3451
3452         return DM_MAPIO_REMAPPED;
3453 }
3454
3455 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3456 {
3457         int r;
3458         struct pool_c *pt = ti->private;
3459         struct pool *pool = pt->pool;
3460         sector_t data_size = ti->len;
3461         dm_block_t sb_data_size;
3462
3463         *need_commit = false;
3464
3465         (void) sector_div(data_size, pool->sectors_per_block);
3466
3467         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3468         if (r) {
3469                 DMERR("%s: failed to retrieve data device size",
3470                       dm_device_name(pool->pool_md));
3471                 return r;
3472         }
3473
3474         if (data_size < sb_data_size) {
3475                 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3476                       dm_device_name(pool->pool_md),
3477                       (unsigned long long)data_size, sb_data_size);
3478                 return -EINVAL;
3479
3480         } else if (data_size > sb_data_size) {
3481                 if (dm_pool_metadata_needs_check(pool->pmd)) {
3482                         DMERR("%s: unable to grow the data device until repaired.",
3483                               dm_device_name(pool->pool_md));
3484                         return 0;
3485                 }
3486
3487                 if (sb_data_size)
3488                         DMINFO("%s: growing the data device from %llu to %llu blocks",
3489                                dm_device_name(pool->pool_md),
3490                                sb_data_size, (unsigned long long)data_size);
3491                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
3492                 if (r) {
3493                         metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
3494                         return r;
3495                 }
3496
3497                 *need_commit = true;
3498         }
3499
3500         return 0;
3501 }
3502
3503 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3504 {
3505         int r;
3506         struct pool_c *pt = ti->private;
3507         struct pool *pool = pt->pool;
3508         dm_block_t metadata_dev_size, sb_metadata_dev_size;
3509
3510         *need_commit = false;
3511
3512         metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
3513
3514         r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3515         if (r) {
3516                 DMERR("%s: failed to retrieve metadata device size",
3517                       dm_device_name(pool->pool_md));
3518                 return r;
3519         }
3520
3521         if (metadata_dev_size < sb_metadata_dev_size) {
3522                 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3523                       dm_device_name(pool->pool_md),
3524                       metadata_dev_size, sb_metadata_dev_size);
3525                 return -EINVAL;
3526
3527         } else if (metadata_dev_size > sb_metadata_dev_size) {
3528                 if (dm_pool_metadata_needs_check(pool->pmd)) {
3529                         DMERR("%s: unable to grow the metadata device until repaired.",
3530                               dm_device_name(pool->pool_md));
3531                         return 0;
3532                 }
3533
3534                 warn_if_metadata_device_too_big(pool->md_dev);
3535                 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3536                        dm_device_name(pool->pool_md),
3537                        sb_metadata_dev_size, metadata_dev_size);
3538
3539                 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3540                         set_pool_mode(pool, PM_WRITE);
3541
3542                 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3543                 if (r) {
3544                         metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
3545                         return r;
3546                 }
3547
3548                 *need_commit = true;
3549         }
3550
3551         return 0;
3552 }
3553
3554 /*
3555  * Retrieves the number of blocks of the data device from
3556  * the superblock and compares it to the actual device size,
3557  * thus resizing the data device in case it has grown.
3558  *
3559  * This copes both with opening a preallocated data device in the ctr
3560  * followed by a resume
3561  * -and-
3562  * with calling the resume method on its own after userspace has
3563  * grown the data device in reaction to a table event.
3564  */
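/*
 * Illustrative userspace reaction to a low-water-mark event (the commands
 * and device names are assumptions, not part of this driver): grow the
 * data device, then reload and resume the pool so this preresume path
 * picks up the new size, e.g.
 *
 *   dmsetup suspend pool
 *   dmsetup reload pool --table "<new table with a larger data dev>"
 *   dmsetup resume pool
 */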
3565 static int pool_preresume(struct dm_target *ti)
3566 {
3567         int r;
3568         bool need_commit1, need_commit2;
3569         struct pool_c *pt = ti->private;
3570         struct pool *pool = pt->pool;
3571
3572         /*
3573          * Take control of the pool object.
3574          */
3575         r = bind_control_target(pool, ti);
3576         if (r)
3577                 goto out;
3578
3579         r = maybe_resize_data_dev(ti, &need_commit1);
3580         if (r)
3581                 goto out;
3582
3583         r = maybe_resize_metadata_dev(ti, &need_commit2);
3584         if (r)
3585                 goto out;
3586
3587         if (need_commit1 || need_commit2)
3588                 (void) commit(pool);
3589 out:
3590         /*
3591          * When a thin-pool is in PM_FAIL mode it cannot be rebuilt if a
3592          * bio is sitting in the deferred list, so we need to return 0 here
3593          * to allow pool_resume() to flush the pending IO.
3594          */
3595         if (r && get_pool_mode(pool) == PM_FAIL)
3596                 r = 0;
3597
3598         return r;
3599 }
3600
3601 static void pool_suspend_active_thins(struct pool *pool)
3602 {
3603         struct thin_c *tc;
3604
3605         /* Suspend all active thin devices */
3606         tc = get_first_thin(pool);
3607         while (tc) {
3608                 dm_internal_suspend_noflush(tc->thin_md);
3609                 tc = get_next_thin(pool, tc);
3610         }
3611 }
3612
3613 static void pool_resume_active_thins(struct pool *pool)
3614 {
3615         struct thin_c *tc;
3616
3617         /* Resume all active thin devices */
3618         tc = get_first_thin(pool);
3619         while (tc) {
3620                 dm_internal_resume(tc->thin_md);
3621                 tc = get_next_thin(pool, tc);
3622         }
3623 }
3624
3625 static void pool_resume(struct dm_target *ti)
3626 {
3627         struct pool_c *pt = ti->private;
3628         struct pool *pool = pt->pool;
3629
3630         /*
3631          * Must requeue active_thins' bios and then resume
3632          * active_thins _before_ clearing 'suspend' flag.
3633          */
3634         requeue_bios(pool);
3635         pool_resume_active_thins(pool);
3636
3637         spin_lock_irq(&pool->lock);
3638         pool->low_water_triggered = false;
3639         pool->suspended = false;
3640         spin_unlock_irq(&pool->lock);
3641
3642         do_waker(&pool->waker.work);
3643 }
3644
3645 static void pool_presuspend(struct dm_target *ti)
3646 {
3647         struct pool_c *pt = ti->private;
3648         struct pool *pool = pt->pool;
3649
3650         spin_lock_irq(&pool->lock);
3651         pool->suspended = true;
3652         spin_unlock_irq(&pool->lock);
3653
3654         pool_suspend_active_thins(pool);
3655 }
3656
3657 static void pool_presuspend_undo(struct dm_target *ti)
3658 {
3659         struct pool_c *pt = ti->private;
3660         struct pool *pool = pt->pool;
3661
3662         pool_resume_active_thins(pool);
3663
3664         spin_lock_irq(&pool->lock);
3665         pool->suspended = false;
3666         spin_unlock_irq(&pool->lock);
3667 }
3668
3669 static void pool_postsuspend(struct dm_target *ti)
3670 {
3671         struct pool_c *pt = ti->private;
3672         struct pool *pool = pt->pool;
3673
3674         cancel_delayed_work_sync(&pool->waker);
3675         cancel_delayed_work_sync(&pool->no_space_timeout);
3676         flush_workqueue(pool->wq);
3677         (void) commit(pool);
3678 }
3679
3680 static int check_arg_count(unsigned int argc, unsigned int args_required)
3681 {
3682         if (argc != args_required) {
3683                 DMWARN("Message received with %u arguments instead of %u.",
3684                        argc, args_required);
3685                 return -EINVAL;
3686         }
3687
3688         return 0;
3689 }
3690
3691 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3692 {
3693         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3694             *dev_id <= MAX_DEV_ID)
3695                 return 0;
3696
3697         if (warning)
3698                 DMWARN("Message received with invalid device id: %s", arg);
3699
3700         return -EINVAL;
3701 }
3702
3703 static int process_create_thin_mesg(unsigned int argc, char **argv, struct pool *pool)
3704 {
3705         dm_thin_id dev_id;
3706         int r;
3707
3708         r = check_arg_count(argc, 2);
3709         if (r)
3710                 return r;
3711
3712         r = read_dev_id(argv[1], &dev_id, 1);
3713         if (r)
3714                 return r;
3715
3716         r = dm_pool_create_thin(pool->pmd, dev_id);
3717         if (r) {
3718                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3719                        argv[1]);
3720                 return r;
3721         }
3722
3723         return 0;
3724 }
3725
3726 static int process_create_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3727 {
3728         dm_thin_id dev_id;
3729         dm_thin_id origin_dev_id;
3730         int r;
3731
3732         r = check_arg_count(argc, 3);
3733         if (r)
3734                 return r;
3735
3736         r = read_dev_id(argv[1], &dev_id, 1);
3737         if (r)
3738                 return r;
3739
3740         r = read_dev_id(argv[2], &origin_dev_id, 1);
3741         if (r)
3742                 return r;
3743
3744         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3745         if (r) {
3746                 DMWARN("Creation of new snapshot %s of device %s failed.",
3747                        argv[1], argv[2]);
3748                 return r;
3749         }
3750
3751         return 0;
3752 }
3753
3754 static int process_delete_mesg(unsigned int argc, char **argv, struct pool *pool)
3755 {
3756         dm_thin_id dev_id;
3757         int r;
3758
3759         r = check_arg_count(argc, 2);
3760         if (r)
3761                 return r;
3762
3763         r = read_dev_id(argv[1], &dev_id, 1);
3764         if (r)
3765                 return r;
3766
3767         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3768         if (r)
3769                 DMWARN("Deletion of thin device %s failed.", argv[1]);
3770
3771         return r;
3772 }
3773
3774 static int process_set_transaction_id_mesg(unsigned int argc, char **argv, struct pool *pool)
3775 {
3776         dm_thin_id old_id, new_id;
3777         int r;
3778
3779         r = check_arg_count(argc, 3);
3780         if (r)
3781                 return r;
3782
3783         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3784                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3785                 return -EINVAL;
3786         }
3787
3788         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3789                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3790                 return -EINVAL;
3791         }
3792
3793         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3794         if (r) {
3795                 DMWARN("Failed to change transaction id from %s to %s.",
3796                        argv[1], argv[2]);
3797                 return r;
3798         }
3799
3800         return 0;
3801 }
3802
3803 static int process_reserve_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3804 {
3805         int r;
3806
3807         r = check_arg_count(argc, 1);
3808         if (r)
3809                 return r;
3810
3811         (void) commit(pool);
3812
3813         r = dm_pool_reserve_metadata_snap(pool->pmd);
3814         if (r)
3815                 DMWARN("reserve_metadata_snap message failed.");
3816
3817         return r;
3818 }
3819
3820 static int process_release_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3821 {
3822         int r;
3823
3824         r = check_arg_count(argc, 1);
3825         if (r)
3826                 return r;
3827
3828         r = dm_pool_release_metadata_snap(pool->pmd);
3829         if (r)
3830                 DMWARN("release_metadata_snap message failed.");
3831
3832         return r;
3833 }
3834
3835 /*
3836  * Messages supported:
3837  *   create_thin        <dev_id>
3838  *   create_snap        <dev_id> <origin_id>
3839  *   delete             <dev_id>
3840  *   set_transaction_id <current_trans_id> <new_trans_id>
3841  *   reserve_metadata_snap
3842  *   release_metadata_snap
3843  */
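/*
 * Illustrative use from userspace (the device name is an assumption):
 *
 *   dmsetup message /dev/mapper/pool 0 create_thin 0
 *   dmsetup message /dev/mapper/pool 0 create_snap 1 0
 *   dmsetup message /dev/mapper/pool 0 delete 1
 *
 * i.e. create thin device 0, snapshot it as device 1, then delete the
 * snapshot again.
 */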
3844 static int pool_message(struct dm_target *ti, unsigned int argc, char **argv,
3845                         char *result, unsigned int maxlen)
3846 {
3847         int r = -EINVAL;
3848         struct pool_c *pt = ti->private;
3849         struct pool *pool = pt->pool;
3850
3851         if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3852                 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3853                       dm_device_name(pool->pool_md));
3854                 return -EOPNOTSUPP;
3855         }
3856
3857         if (!strcasecmp(argv[0], "create_thin"))
3858                 r = process_create_thin_mesg(argc, argv, pool);
3859
3860         else if (!strcasecmp(argv[0], "create_snap"))
3861                 r = process_create_snap_mesg(argc, argv, pool);
3862
3863         else if (!strcasecmp(argv[0], "delete"))
3864                 r = process_delete_mesg(argc, argv, pool);
3865
3866         else if (!strcasecmp(argv[0], "set_transaction_id"))
3867                 r = process_set_transaction_id_mesg(argc, argv, pool);
3868
3869         else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3870                 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3871
3872         else if (!strcasecmp(argv[0], "release_metadata_snap"))
3873                 r = process_release_metadata_snap_mesg(argc, argv, pool);
3874
3875         else
3876                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3877
3878         if (!r)
3879                 (void) commit(pool);
3880
3881         return r;
3882 }
3883
3884 static void emit_flags(struct pool_features *pf, char *result,
3885                        unsigned int sz, unsigned int maxlen)
3886 {
3887         unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled +
3888                 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3889                 pf->error_if_no_space;
3890         DMEMIT("%u ", count);
3891
3892         if (!pf->zero_new_blocks)
3893                 DMEMIT("skip_block_zeroing ");
3894
3895         if (!pf->discard_enabled)
3896                 DMEMIT("ignore_discard ");
3897
3898         if (!pf->discard_passdown)
3899                 DMEMIT("no_discard_passdown ");
3900
3901         if (pf->mode == PM_READ_ONLY)
3902                 DMEMIT("read_only ");
3903
3904         if (pf->error_if_no_space)
3905                 DMEMIT("error_if_no_space ");
3906 }
3907
3908 /*
3909  * Status line is:
3910  *    <transaction id> <used metadata blocks>/<total metadata blocks>
3911  *    <used data blocks>/<total data blocks> <held metadata root>
3912  *    <pool mode> <discard config> <no space config> <needs_check> <metadata low watermark>
3913  */
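/*
 * Example STATUSTYPE_INFO output (illustrative values only):
 *
 *   2 510/4161600 1280/16384 - rw discard_passdown queue_if_no_space - 1024
 *
 * i.e. transaction id 2, 510 of 4161600 metadata blocks used, 1280 of
 * 16384 data blocks used, no held metadata root, read-write mode,
 * discards passed down, IO queued when out of data space, needs_check
 * not set, and a metadata low-water threshold of 1024 blocks.
 */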
3914 static void pool_status(struct dm_target *ti, status_type_t type,
3915                         unsigned int status_flags, char *result, unsigned int maxlen)
3916 {
3917         int r;
3918         unsigned int sz = 0;
3919         uint64_t transaction_id;
3920         dm_block_t nr_free_blocks_data;
3921         dm_block_t nr_free_blocks_metadata;
3922         dm_block_t nr_blocks_data;
3923         dm_block_t nr_blocks_metadata;
3924         dm_block_t held_root;
3925         enum pool_mode mode;
3926         char buf[BDEVNAME_SIZE];
3927         char buf2[BDEVNAME_SIZE];
3928         struct pool_c *pt = ti->private;
3929         struct pool *pool = pt->pool;
3930
3931         switch (type) {
3932         case STATUSTYPE_INFO:
3933                 if (get_pool_mode(pool) == PM_FAIL) {
3934                         DMEMIT("Fail");
3935                         break;
3936                 }
3937
3938                 /* Commit to ensure statistics aren't out-of-date */
3939                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3940                         (void) commit(pool);
3941
3942                 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3943                 if (r) {
3944                         DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3945                               dm_device_name(pool->pool_md), r);
3946                         goto err;
3947                 }
3948
3949                 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3950                 if (r) {
3951                         DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3952                               dm_device_name(pool->pool_md), r);
3953                         goto err;
3954                 }
3955
3956                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3957                 if (r) {
3958                         DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3959                               dm_device_name(pool->pool_md), r);
3960                         goto err;
3961                 }
3962
3963                 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3964                 if (r) {
3965                         DMERR("%s: dm_pool_get_free_block_count returned %d",
3966                               dm_device_name(pool->pool_md), r);
3967                         goto err;
3968                 }
3969
3970                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3971                 if (r) {
3972                         DMERR("%s: dm_pool_get_data_dev_size returned %d",
3973                               dm_device_name(pool->pool_md), r);
3974                         goto err;
3975                 }
3976
3977                 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3978                 if (r) {
3979                         DMERR("%s: dm_pool_get_metadata_snap returned %d",
3980                               dm_device_name(pool->pool_md), r);
3981                         goto err;
3982                 }
3983
3984                 DMEMIT("%llu %llu/%llu %llu/%llu ",
3985                        (unsigned long long)transaction_id,
3986                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3987                        (unsigned long long)nr_blocks_metadata,
3988                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3989                        (unsigned long long)nr_blocks_data);
3990
3991                 if (held_root)
3992                         DMEMIT("%llu ", held_root);
3993                 else
3994                         DMEMIT("- ");
3995
3996                 mode = get_pool_mode(pool);
3997                 if (mode == PM_OUT_OF_DATA_SPACE)
3998                         DMEMIT("out_of_data_space ");
3999                 else if (is_read_only_pool_mode(mode))
4000                         DMEMIT("ro ");
4001                 else
4002                         DMEMIT("rw ");
4003
4004                 if (!pool->pf.discard_enabled)
4005                         DMEMIT("ignore_discard ");
4006                 else if (pool->pf.discard_passdown)
4007                         DMEMIT("discard_passdown ");
4008                 else
4009                         DMEMIT("no_discard_passdown ");
4010
4011                 if (pool->pf.error_if_no_space)
4012                         DMEMIT("error_if_no_space ");
4013                 else
4014                         DMEMIT("queue_if_no_space ");
4015
4016                 if (dm_pool_metadata_needs_check(pool->pmd))
4017                         DMEMIT("needs_check ");
4018                 else
4019                         DMEMIT("- ");
4020
4021                 DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
4022
4023                 break;
4024
4025         case STATUSTYPE_TABLE:
4026                 DMEMIT("%s %s %lu %llu ",
4027                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
4028                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
4029                        (unsigned long)pool->sectors_per_block,
4030                        (unsigned long long)pt->low_water_blocks);
4031                 emit_flags(&pt->requested_pf, result, sz, maxlen);
4032                 break;
4033
4034         case STATUSTYPE_IMA:
4035                 *result = '\0';
4036                 break;
4037         }
4038         return;
4039
4040 err:
4041         DMEMIT("Error");
4042 }
4043
4044 static int pool_iterate_devices(struct dm_target *ti,
4045                                 iterate_devices_callout_fn fn, void *data)
4046 {
4047         struct pool_c *pt = ti->private;
4048
4049         return fn(ti, pt->data_dev, 0, ti->len, data);
4050 }
4051
4052 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
4053 {
4054         struct pool_c *pt = ti->private;
4055         struct pool *pool = pt->pool;
4056         sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
4057
4058         /*
4059          * If max_sectors is smaller than pool->sectors_per_block adjust it
4060          * to the highest possible power-of-2 factor of pool->sectors_per_block.
4061          * This is especially beneficial when the pool's data device is a RAID
4062          * device that has a full stripe width that matches pool->sectors_per_block
4063          * -- because even though partial RAID stripe-sized IOs will be issued to a
4064          *    single RAID stripe, when aggregated they will end on a full RAID stripe
4065          *    boundary, which avoids cascading additional partial RAID stripe writes.
4066          */
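        /*
         * Worked example (illustrative numbers): with sectors_per_block == 768
         * and max_sectors == 320, 320 is neither a factor of 768 nor a power
         * of two, so it is rounded down to 256; 256 divides 768 evenly, so
         * the loop below stops with max_sectors == 256.
         */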
4067         if (limits->max_sectors < pool->sectors_per_block) {
4068                 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
4069                         if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
4070                                 limits->max_sectors--;
4071                         limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
4072                 }
4073         }
4074
4075         /*
4076          * If the system-determined stacked limits are compatible with the
4077          * pool's blocksize (io_opt is a factor) do not override them.
4078          */
4079         if (io_opt_sectors < pool->sectors_per_block ||
4080             !is_factor(io_opt_sectors, pool->sectors_per_block)) {
4081                 if (is_factor(pool->sectors_per_block, limits->max_sectors))
4082                         limits->io_min = limits->max_sectors << SECTOR_SHIFT;
4083                 else
4084                         limits->io_min = pool->sectors_per_block << SECTOR_SHIFT;
4085                 limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT;
4086         }
4087
4088         /*
4089          * pt->adjusted_pf is a staging area for the actual features to use.
4090          * They get transferred to the live pool in bind_control_target()
4091          * called from pool_preresume().
4092          */
4093
4094         if (pt->adjusted_pf.discard_enabled) {
4095                 disable_discard_passdown_if_not_supported(pt);
4096                 if (!pt->adjusted_pf.discard_passdown)
4097                         limits->max_hw_discard_sectors = 0;
4098                 /*
4099                  * The pool uses the same discard limits as the underlying data
4100                  * device.  DM core has already set this up.
4101                  */
4102         } else {
4103                 /*
4104                  * Must explicitly disallow stacking discard limits otherwise the
4105                  * block layer will stack them if pool's data device has support.
4106                  */
4107                 limits->discard_granularity = 0;
4108         }
4109 }
4110
4111 static struct target_type pool_target = {
4112         .name = "thin-pool",
4113         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
4114                     DM_TARGET_IMMUTABLE,
4115         .version = {1, 23, 0},
4116         .module = THIS_MODULE,
4117         .ctr = pool_ctr,
4118         .dtr = pool_dtr,
4119         .map = pool_map,
4120         .presuspend = pool_presuspend,
4121         .presuspend_undo = pool_presuspend_undo,
4122         .postsuspend = pool_postsuspend,
4123         .preresume = pool_preresume,
4124         .resume = pool_resume,
4125         .message = pool_message,
4126         .status = pool_status,
4127         .iterate_devices = pool_iterate_devices,
4128         .io_hints = pool_io_hints,
4129 };
4130
4131 /*
4132  *--------------------------------------------------------------
4133  * Thin target methods
4134  *--------------------------------------------------------------
4135  */
4136 static void thin_get(struct thin_c *tc)
4137 {
4138         refcount_inc(&tc->refcount);
4139 }
4140
4141 static void thin_put(struct thin_c *tc)
4142 {
4143         if (refcount_dec_and_test(&tc->refcount))
4144                 complete(&tc->can_destroy);
4145 }
4146
4147 static void thin_dtr(struct dm_target *ti)
4148 {
4149         struct thin_c *tc = ti->private;
4150
4151         spin_lock_irq(&tc->pool->lock);
4152         list_del_rcu(&tc->list);
4153         spin_unlock_irq(&tc->pool->lock);
4154         synchronize_rcu();
4155
4156         thin_put(tc);
4157         wait_for_completion(&tc->can_destroy);
4158
4159         mutex_lock(&dm_thin_pool_table.mutex);
4160
4161         __pool_dec(tc->pool);
4162         dm_pool_close_thin_device(tc->td);
4163         dm_put_device(ti, tc->pool_dev);
4164         if (tc->origin_dev)
4165                 dm_put_device(ti, tc->origin_dev);
4166         kfree(tc);
4167
4168         mutex_unlock(&dm_thin_pool_table.mutex);
4169 }
4170
4171 /*
4172  * Thin target parameters:
4173  *
4174  * <pool_dev> <dev_id> [origin_dev]
4175  *
4176  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
4177  * dev_id: the internal device identifier
4178  * origin_dev: a device external to the pool that should act as the origin
4179  *
4180  * If the pool device has discards disabled, they get disabled for the thin
4181  * device as well.
4182  */
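/*
 * Example table lines (illustrative; paths, sizes and ids are assumptions):
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *   0 2097152 thin /dev/mapper/pool 1 /dev/sdd
 *
 * The first activates thin device 0; the second activates thin device 1
 * with /dev/sdd acting as an external origin.
 */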
4183 static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4184 {
4185         int r;
4186         struct thin_c *tc;
4187         struct dm_dev *pool_dev, *origin_dev;
4188         struct mapped_device *pool_md;
4189
4190         mutex_lock(&dm_thin_pool_table.mutex);
4191
4192         if (argc != 2 && argc != 3) {
4193                 ti->error = "Invalid argument count";
4194                 r = -EINVAL;
4195                 goto out_unlock;
4196         }
4197
4198         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
4199         if (!tc) {
4200                 ti->error = "Out of memory";
4201                 r = -ENOMEM;
4202                 goto out_unlock;
4203         }
4204         tc->thin_md = dm_table_get_md(ti->table);
4205         spin_lock_init(&tc->lock);
4206         INIT_LIST_HEAD(&tc->deferred_cells);
4207         bio_list_init(&tc->deferred_bio_list);
4208         bio_list_init(&tc->retry_on_resume_list);
4209         tc->sort_bio_list = RB_ROOT;
4210
4211         if (argc == 3) {
4212                 if (!strcmp(argv[0], argv[2])) {
4213                         ti->error = "Error setting origin device";
4214                         r = -EINVAL;
4215                         goto bad_origin_dev;
4216                 }
4217
4218                 r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev);
4219                 if (r) {
4220                         ti->error = "Error opening origin device";
4221                         goto bad_origin_dev;
4222                 }
4223                 tc->origin_dev = origin_dev;
4224         }
4225
4226         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
4227         if (r) {
4228                 ti->error = "Error opening pool device";
4229                 goto bad_pool_dev;
4230         }
4231         tc->pool_dev = pool_dev;
4232
4233         if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
4234                 ti->error = "Invalid device id";
4235                 r = -EINVAL;
4236                 goto bad_common;
4237         }
4238
4239         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
4240         if (!pool_md) {
4241                 ti->error = "Couldn't get pool mapped device";
4242                 r = -EINVAL;
4243                 goto bad_common;
4244         }
4245
4246         tc->pool = __pool_table_lookup(pool_md);
4247         if (!tc->pool) {
4248                 ti->error = "Couldn't find pool object";
4249                 r = -EINVAL;
4250                 goto bad_pool_lookup;
4251         }
4252         __pool_inc(tc->pool);
4253
4254         if (get_pool_mode(tc->pool) == PM_FAIL) {
4255                 ti->error = "Couldn't open thin device, Pool is in fail mode";
4256                 r = -EINVAL;
4257                 goto bad_pool;
4258         }
4259
4260         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
4261         if (r) {
4262                 ti->error = "Couldn't open thin internal device";
4263                 goto bad_pool;
4264         }
4265
4266         r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
4267         if (r)
4268                 goto bad;
4269
4270         ti->num_flush_bios = 1;
4271         ti->limit_swap_bios = true;
4272         ti->flush_supported = true;
4273         ti->accounts_remapped_io = true;
4274         ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
4275
4276         /* In case the pool supports discards, pass them on. */
4277         if (tc->pool->pf.discard_enabled) {
4278                 ti->discards_supported = true;
4279                 ti->num_discard_bios = 1;
4280                 ti->max_discard_granularity = true;
4281         }
4282
4283         mutex_unlock(&dm_thin_pool_table.mutex);
4284
4285         spin_lock_irq(&tc->pool->lock);
4286         if (tc->pool->suspended) {
4287                 spin_unlock_irq(&tc->pool->lock);
4288                 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
4289                 ti->error = "Unable to activate thin device while pool is suspended";
4290                 r = -EINVAL;
4291                 goto bad;
4292         }
4293         refcount_set(&tc->refcount, 1);
4294         init_completion(&tc->can_destroy);
4295         list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
4296         spin_unlock_irq(&tc->pool->lock);
4297         /*
4298          * This synchronize_rcu() call is needed here otherwise we risk a
4299          * wake_worker() call finding no bios to process (because the newly
4300          * added tc isn't yet visible).  So this reduces latency since we
4301          * aren't then dependent on the periodic commit to wake_worker().
4302          */
4303         synchronize_rcu();
4304
4305         dm_put(pool_md);
4306
4307         return 0;
4308
4309 bad:
4310         dm_pool_close_thin_device(tc->td);
4311 bad_pool:
4312         __pool_dec(tc->pool);
4313 bad_pool_lookup:
4314         dm_put(pool_md);
4315 bad_common:
4316         dm_put_device(ti, tc->pool_dev);
4317 bad_pool_dev:
4318         if (tc->origin_dev)
4319                 dm_put_device(ti, tc->origin_dev);
4320 bad_origin_dev:
4321         kfree(tc);
4322 out_unlock:
4323         mutex_unlock(&dm_thin_pool_table.mutex);
4324
4325         return r;
4326 }
4327
4328 static int thin_map(struct dm_target *ti, struct bio *bio)
4329 {
4330         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
4331
4332         return thin_bio_map(ti, bio);
4333 }
4334
4335 static int thin_endio(struct dm_target *ti, struct bio *bio,
4336                 blk_status_t *err)
4337 {
4338         unsigned long flags;
4339         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
4340         struct list_head work;
4341         struct dm_thin_new_mapping *m, *tmp;
4342         struct pool *pool = h->tc->pool;
4343
4344         if (h->shared_read_entry) {
4345                 INIT_LIST_HEAD(&work);
4346                 dm_deferred_entry_dec(h->shared_read_entry, &work);
4347
4348                 spin_lock_irqsave(&pool->lock, flags);
4349                 list_for_each_entry_safe(m, tmp, &work, list) {
4350                         list_del(&m->list);
4351                         __complete_mapping_preparation(m);
4352                 }
4353                 spin_unlock_irqrestore(&pool->lock, flags);
4354         }
4355
4356         if (h->all_io_entry) {
4357                 INIT_LIST_HEAD(&work);
4358                 dm_deferred_entry_dec(h->all_io_entry, &work);
4359                 if (!list_empty(&work)) {
4360                         spin_lock_irqsave(&pool->lock, flags);
4361                         list_for_each_entry_safe(m, tmp, &work, list)
4362                                 list_add_tail(&m->list, &pool->prepared_discards);
4363                         spin_unlock_irqrestore(&pool->lock, flags);
4364                         wake_worker(pool);
4365                 }
4366         }
4367
4368         if (h->cell)
4369                 cell_defer_no_holder(h->tc, h->cell);
4370
4371         return DM_ENDIO_DONE;
4372 }
4373
4374 static void thin_presuspend(struct dm_target *ti)
4375 {
4376         struct thin_c *tc = ti->private;
4377
4378         if (dm_noflush_suspending(ti))
4379                 noflush_work(tc, do_noflush_start);
4380 }
4381
4382 static void thin_postsuspend(struct dm_target *ti)
4383 {
4384         struct thin_c *tc = ti->private;
4385
4386         /*
4387          * The dm_noflush_suspending flag has been cleared by now, so
4388          * unfortunately we must always run this.
4389          */
4390         noflush_work(tc, do_noflush_stop);
4391 }
4392
4393 static int thin_preresume(struct dm_target *ti)
4394 {
4395         struct thin_c *tc = ti->private;
4396
4397         if (tc->origin_dev)
4398                 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
4399
4400         return 0;
4401 }
4402
4403 /*
4404  * <nr mapped sectors> <highest mapped sector>
4405  */
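/*
 * Example STATUSTYPE_INFO output (illustrative values): "131072 262143"
 * for a thin device with 131072 sectors mapped and sector 262143 as its
 * highest mapped sector; "-" is emitted instead when nothing is mapped.
 */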
4406 static void thin_status(struct dm_target *ti, status_type_t type,
4407                         unsigned int status_flags, char *result, unsigned int maxlen)
4408 {
4409         int r;
4410         ssize_t sz = 0;
4411         dm_block_t mapped, highest;
4412         char buf[BDEVNAME_SIZE];
4413         struct thin_c *tc = ti->private;
4414
4415         if (get_pool_mode(tc->pool) == PM_FAIL) {
4416                 DMEMIT("Fail");
4417                 return;
4418         }
4419
4420         if (!tc->td)
4421                 DMEMIT("-");
4422         else {
4423                 switch (type) {
4424                 case STATUSTYPE_INFO:
4425                         r = dm_thin_get_mapped_count(tc->td, &mapped);
4426                         if (r) {
4427                                 DMERR("dm_thin_get_mapped_count returned %d", r);
4428                                 goto err;
4429                         }
4430
4431                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
4432                         if (r < 0) {
4433                                 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4434                                 goto err;
4435                         }
4436
4437                         DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4438                         if (r)
4439                                 DMEMIT("%llu", ((highest + 1) *
4440                                                 tc->pool->sectors_per_block) - 1);
4441                         else
4442                                 DMEMIT("-");
4443                         break;
4444
4445                 case STATUSTYPE_TABLE:
4446                         DMEMIT("%s %lu",
4447                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4448                                (unsigned long) tc->dev_id);
4449                         if (tc->origin_dev)
4450                                 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
4451                         break;
4452
4453                 case STATUSTYPE_IMA:
4454                         *result = '\0';
4455                         break;
4456                 }
4457         }
4458
4459         return;
4460
4461 err:
4462         DMEMIT("Error");
4463 }
4464
4465 static int thin_iterate_devices(struct dm_target *ti,
4466                                 iterate_devices_callout_fn fn, void *data)
4467 {
4468         sector_t blocks;
4469         struct thin_c *tc = ti->private;
4470         struct pool *pool = tc->pool;
4471
4472         /*
4473          * We can't call dm_pool_get_data_dev_size() since that blocks.  So
4474          * we follow a more convoluted path through to the pool's target.
4475          */
4476         if (!pool->ti)
4477                 return 0;       /* nothing is bound */
4478
4479         blocks = pool->ti->len;
4480         (void) sector_div(blocks, pool->sectors_per_block);
4481         if (blocks)
4482                 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
4483
4484         return 0;
4485 }
4486
4487 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4488 {
4489         struct thin_c *tc = ti->private;
4490         struct pool *pool = tc->pool;
4491
4492         if (pool->pf.discard_enabled) {
4493                 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4494                 limits->max_hw_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
4495         }
4496 }
4497
4498 static struct target_type thin_target = {
4499         .name = "thin",
4500         .version = {1, 23, 0},
4501         .module = THIS_MODULE,
4502         .ctr = thin_ctr,
4503         .dtr = thin_dtr,
4504         .map = thin_map,
4505         .end_io = thin_endio,
4506         .preresume = thin_preresume,
4507         .presuspend = thin_presuspend,
4508         .postsuspend = thin_postsuspend,
4509         .status = thin_status,
4510         .iterate_devices = thin_iterate_devices,
4511         .io_hints = thin_io_hints,
4512 };
4513
4514 /*----------------------------------------------------------------*/
4515
4516 static int __init dm_thin_init(void)
4517 {
4518         int r = -ENOMEM;
4519
4520         pool_table_init();
4521
4522         _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
4523         if (!_new_mapping_cache)
4524                 return r;
4525
4526         r = dm_register_target(&thin_target);
4527         if (r)
4528                 goto bad_new_mapping_cache;
4529
4530         r = dm_register_target(&pool_target);
4531         if (r)
4532                 goto bad_thin_target;
4533
4534         return 0;
4535
4536 bad_thin_target:
4537         dm_unregister_target(&thin_target);
4538 bad_new_mapping_cache:
4539         kmem_cache_destroy(_new_mapping_cache);
4540
4541         return r;
4542 }
4543
4544 static void dm_thin_exit(void)
4545 {
4546         dm_unregister_target(&thin_target);
4547         dm_unregister_target(&pool_target);
4548
4549         kmem_cache_destroy(_new_mapping_cache);
4550
4551         pool_table_exit();
4552 }
4553
4554 module_init(dm_thin_init);
4555 module_exit(dm_thin_exit);
4556
4557 module_param_named(no_space_timeout, no_space_timeout_secs, uint, 0644);
4558 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
4559
4560 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4561 MODULE_AUTHOR("Joe Thornber <[email protected]>");
4562 MODULE_LICENSE("GPL");