Git Repo - linux.git/blame - drivers/md/dm-thin.c

Commit	Line	Data
3bd94003	1	// SPDX-License-Identifier: GPL-2.0-only
991d9fa0	2	/*
e49e5829	3	* Copyright (C) 2011-2012 Red Hat UK.
991d9fa0 JT	4	*
	5	* This file is released under the GPL.
	6	*/
	7
	8	#include "dm-thin-metadata.h"
742c8fdc	9	#include "dm-bio-prison-v1.h"
1f4e0ff0	10	#include "dm.h"
991d9fa0 JT	11
	12	#include <linux/device-mapper.h>
	13	#include <linux/dm-io.h>
	14	#include <linux/dm-kcopyd.h>
0f30af98	15	#include <linux/jiffies.h>
604ea906	16	#include <linux/log2.h>
991d9fa0	17	#include <linux/list.h>
c140e1c4	18	#include <linux/rculist.h>
991d9fa0 JT	19	#include <linux/init.h>
	20	#include <linux/module.h>
	21	#include <linux/slab.h>
a822c83e	22	#include <linux/vmalloc.h>
ac4c3f34	23	#include <linux/sort.h>
67324ea1	24	#include <linux/rbtree.h>
991d9fa0 JT	25
	26	#define DM_MSG_PREFIX "thin"
	27
	28	/*
	29	* Tunable constants
	30	*/
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

/* Runtime copy of the no-space timeout, initialised to the default above. */
static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
991d9fa0	37
df5d2e90 MP	38	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
	39	"A percentage of time allocated for copy on write");
	40
991d9fa0 JT	41	/*
	42	* The block size of the device holding pool data must be
	43	* between 64KB and 1GB.
	44	*/
	45	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
	46	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
	47
991d9fa0 JT	48	/*
	49	* Device id is restricted to 24 bits.
	50	*/
	51	#define MAX_DEV_ID ((1 << 24) - 1)
	52
	53	/*
	54	* How do we handle breaking sharing of data blocks?
	55	* =================================================
	56	*
	57	* We use a standard copy-on-write btree to store the mappings for the
	58	* devices (note I'm talking about copy-on-write of the metadata here, not
	59	* the data). When you take an internal snapshot you clone the root node
	60	* of the origin btree. After this there is no concept of an origin or a
	61	* snapshot. They are just two device trees that happen to point to the
	62	* same data blocks.
	63	*
	64	* When we get a write in we decide if it's to a shared data block using
	65	* some timestamp magic. If it is, we have to break sharing.
	66	*
	67	* Let's say we write to a shared block in what was the origin. The
	68	* steps are:
	69	*
	70	* i) plug io further to this physical block. (see bio_prison code).
	71	*
	72	* ii) quiesce any read io to that shared data block. Obviously
44feb387	73	* including all devices that share this block. (see dm_deferred_set code)
991d9fa0 JT	74	*
	75	* iii) copy the data block to a newly allocate block. This step can be
	76	* missed out if the io covers the block. (schedule_copy).
	77	*
	78	* iv) insert the new mapping into the origin's btree
fe878f34	79	* (process_prepared_mapping). This act of inserting breaks some
991d9fa0 JT	80	* sharing of btree nodes between the two devices. Breaking sharing only
	81	* effects the btree of that specific device. Btrees for the other
	82	* devices that share the block never change. The btree for the origin
	83	* device as it was after the last commit is untouched, ie. we're using
	84	* persistent data structures in the functional programming sense.
	85	*
	86	* v) unplug io to this physical block, including the io that triggered
	87	* the breaking of sharing.
	88	*
	89	* Steps (ii) and (iii) occur in parallel.
	90	*
	91	* The metadata _doesn't_ need to be committed before the io continues. We
	92	* get away with this because the io is always written to a _new_ block.
	93	* If there's a crash, then:
	94	*
	95	* - The origin mapping will point to the old origin block (the shared
	96	* one). This will contain the data as it was before the io that triggered
	97	* the breaking of sharing came in.
	98	*
	99	* - The snap mapping still points to the old block. As it would after
	100	* the commit.
	101	*
	102	* The downside of this scheme is the timestamp magic isn't perfect, and
	103	* will continue to think that data block in the snapshot device is shared
	104	* even after the write to the origin has broken sharing. I suspect data
	105	* blocks will typically be shared by many different devices, so we're
	106	* breaking sharing n + 1 times, rather than n, where n is the number of
	107	* devices that reference this data block. At the moment I think the
	108	* benefits far, far outweigh the disadvantages.
	109	*/
	110
/*----------------------------------------------------------------*/
	112
991d9fa0 JT	113	/*
	114	* Key building.
	115	*/
/* Selects which address space a cell key refers to (see build_key()). */
enum lock_space {
	VIRTUAL,
	PHYSICAL
};
	120
3f8d3f54	121	static bool build_key(struct dm_thin_device *td, enum lock_space ls,
34fbcf62	122	dm_block_t b, dm_block_t e, struct dm_cell_key *key)
991d9fa0	123	{
34fbcf62	124	key->virtual = (ls == VIRTUAL);
991d9fa0	125	key->dev = dm_thin_dev_id(td);
5f274d88	126	key->block_begin = b;
34fbcf62	127	key->block_end = e;
3f8d3f54 MS	128
3f8d3f54 MS	129	return dm_cell_key_has_valid_range(key);
34fbcf62 JT	130	}
	131
	132	static void build_data_key(struct dm_thin_device *td, dm_block_t b,
	133	struct dm_cell_key *key)
	134	{
3f8d3f54	135	(void) build_key(td, PHYSICAL, b, b + 1llu, key);
991d9fa0 JT	136	}
	137
	138	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
44feb387	139	struct dm_cell_key *key)
991d9fa0	140	{
3f8d3f54	141	(void) build_key(td, VIRTUAL, b, b + 1llu, key);
991d9fa0 JT	142	}
	143
/*----------------------------------------------------------------*/
	145
7d327fe0 JT	146	#define THROTTLE_THRESHOLD (1 * HZ)
	147
	148	struct throttle {
	149	struct rw_semaphore lock;
	150	unsigned long threshold;
	151	bool throttle_applied;
	152	};
	153
	154	static void throttle_init(struct throttle *t)
	155	{
	156	init_rwsem(&t->lock);
	157	t->throttle_applied = false;
	158	}
	159
	160	static void throttle_work_start(struct throttle *t)
	161	{
	162	t->threshold = jiffies + THROTTLE_THRESHOLD;
	163	}
	164
	165	static void throttle_work_update(struct throttle *t)
	166	{
8ca8b1e1	167	if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
7d327fe0 JT	168	down_write(&t->lock);
	169	t->throttle_applied = true;
	170	}
	171	}
	172
	173	static void throttle_work_complete(struct throttle *t)
	174	{
	175	if (t->throttle_applied) {
	176	t->throttle_applied = false;
	177	up_write(&t->lock);
	178	}
	179	}
	180
	181	static void throttle_lock(struct throttle *t)
	182	{
	183	down_read(&t->lock);
	184	}
	185
	186	static void throttle_unlock(struct throttle *t)
	187	{
	188	up_read(&t->lock);
	189	}
	190
/*----------------------------------------------------------------*/
	192
991d9fa0 JT	193	/*
	194	* A pool device ties together a metadata device and a data device. It
	195	* also provides the interface for creating and destroying internal
	196	* devices.
	197	*/
/* Forward declaration; the full definition appears later in this file. */
struct dm_thin_new_mapping;
67e2e2b2	199
e49e5829	200	/*
f6c36758	201	* The pool runs in various modes. Ordered in degraded order for comparisons.
e49e5829 JT	202	*/
	203	enum pool_mode {
	204	PM_WRITE, /* metadata may be changed */
3e1a0699	205	PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
3ab91828 JT	206
	207	/*
	208	* Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
	209	*/
	210	PM_OUT_OF_METADATA_SPACE,
e49e5829	211	PM_READ_ONLY, /* metadata may not be changed */
3ab91828	212
e49e5829 JT	213	PM_FAIL, /* all I/O fails */
	214	};
	215
67e2e2b2	216	struct pool_features {
e49e5829 JT	217	enum pool_mode mode;
e49e5829 JT	218
9bc142dd MS	219	bool zero_new_blocks:1;
	220	bool discard_enabled:1;
	221	bool discard_passdown:1;
787a996c	222	bool error_if_no_space:1;
67e2e2b2 JT	223	};
67e2e2b2 JT	224
struct thin_c;

/* Callback types stored in struct pool for processing bios, cells and mappings. */
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

#define CELL_SORT_ARRAY_SIZE 8192
ac4c3f34 JT	231
991d9fa0 JT	232	struct pool {
	233	struct list_head list;
	234	struct dm_target ti; / Only set if a pool target is bound */
	235
	236	struct mapped_device *pool_md;
873937e7	237	struct block_device *data_dev;
991d9fa0 JT	238	struct block_device *md_dev;
	239	struct dm_pool_metadata *pmd;
	240
991d9fa0	241	dm_block_t low_water_blocks;
55f2b8bd	242	uint32_t sectors_per_block;
f9a8e0cd	243	int sectors_per_block_shift;
991d9fa0	244
67e2e2b2	245	struct pool_features pf;
88a6621b	246	bool low_water_triggered:1; /* A dm event has been sent */
80e96c54	247	bool suspended:1;
c3667cc6	248	bool out_of_data_space:1;
991d9fa0	249
44feb387	250	struct dm_bio_prison *prison;
991d9fa0 JT	251	struct dm_kcopyd_client *copier;
991d9fa0 JT	252
72d711c8	253	struct work_struct worker;
991d9fa0	254	struct workqueue_struct *wq;
7d327fe0	255	struct throttle throttle;
905e51b3	256	struct delayed_work waker;
85ad643b	257	struct delayed_work no_space_timeout;
991d9fa0	258
905e51b3	259	unsigned long last_commit_jiffies;
86a3238c	260	unsigned int ref_count;
991d9fa0 JT	261
991d9fa0 JT	262	spinlock_t lock;
991d9fa0	263	struct bio_list deferred_flush_bios;
4ae280b4	264	struct bio_list deferred_flush_completions;
991d9fa0	265	struct list_head prepared_mappings;
104655fd	266	struct list_head prepared_discards;
2a0fbffb	267	struct list_head prepared_discards_pt2;
c140e1c4	268	struct list_head active_thins;
991d9fa0	269
44feb387 MS	270	struct dm_deferred_set *shared_read_ds;
44feb387 MS	271	struct dm_deferred_set *all_io_ds;
991d9fa0	272
a24c2569	273	struct dm_thin_new_mapping *next_mapping;
e49e5829 JT	274
	275	process_bio_fn process_bio;
	276	process_bio_fn process_discard;
	277
a374bb21 JT	278	process_cell_fn process_cell;
	279	process_cell_fn process_discard_cell;
	280
e49e5829 JT	281	process_mapping_fn process_prepared_mapping;
e49e5829 JT	282	process_mapping_fn process_prepared_discard;
2a0fbffb	283	process_mapping_fn process_prepared_discard_pt2;
ac4c3f34	284
a822c83e	285	struct dm_bio_prison_cell **cell_sort_array;
72d711c8 MS	286
72d711c8 MS	287	mempool_t mapping_pool;
991d9fa0 JT	288	};
991d9fa0 JT	289
/* Defined later in this file; called with the name of the failed operation and its error code. */
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
e49e5829	291
f6c36758 MS	292	static enum pool_mode get_pool_mode(struct pool *pool)
	293	{
	294	return pool->pf.mode;
	295	}
	296
	297	static void notify_of_pool_mode_change(struct pool *pool)
	298	{
774f13ac	299	static const char *descs[] = {
f6c36758 MS	300	"write",
	301	"out-of-data-space",
	302	"read-only",
	303	"read-only",
	304	"fail"
	305	};
	306	const char *extra_desc = NULL;
	307	enum pool_mode mode = get_pool_mode(pool);
	308
	309	if (mode == PM_OUT_OF_DATA_SPACE) {
	310	if (!pool->pf.error_if_no_space)
	311	extra_desc = " (queue IO)";
	312	else
	313	extra_desc = " (error IO)";
	314	}
	315
	316	dm_table_event(pool->ti->table);
	317	DMINFO("%s: switching pool to %s%s mode",
	318	dm_device_name(pool->pool_md),
	319	descs[(int)mode], extra_desc ? : "");
	320	}
	321
991d9fa0 JT	322	/*
	323	* Target context for a pool.
	324	*/
	325	struct pool_c {
	326	struct dm_target *ti;
	327	struct pool *pool;
	328	struct dm_dev *data_dev;
	329	struct dm_dev *metadata_dev;
991d9fa0 JT	330
991d9fa0 JT	331	dm_block_t low_water_blocks;
0424caa1 MS	332	struct pool_features requested_pf; /* Features requested during table load */
0424caa1 MS	333	struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
991d9fa0 JT	334	};
	335
	336	/*
	337	* Target context for a thin.
	338	*/
	339	struct thin_c {
c140e1c4	340	struct list_head list;
991d9fa0	341	struct dm_dev *pool_dev;
2dd9c257	342	struct dm_dev *origin_dev;
e5aea7b4	343	sector_t origin_size;
991d9fa0 JT	344	dm_thin_id dev_id;
	345
	346	struct pool *pool;
	347	struct dm_thin_device *td;
583024d2 MS	348	struct mapped_device *thin_md;
583024d2 MS	349
738211f7	350	bool requeue_mode:1;
c140e1c4	351	spinlock_t lock;
a374bb21	352	struct list_head deferred_cells;
c140e1c4 MS	353	struct bio_list deferred_bio_list;
c140e1c4 MS	354	struct bio_list retry_on_resume_list;
67324ea1	355	struct rb_root sort_bio_list; /* sorted list of deferred bios */
b10ebd34 JT	356
	357	/*
	358	* Ensures the thin is not destroyed until the worker has finished
	359	* iterating the active_thins list.
	360	*/
22d4c291	361	refcount_t refcount;
b10ebd34	362	struct completion can_destroy;
991d9fa0 JT	363	};
	364
/*----------------------------------------------------------------*/
	366
34fbcf62 JT	367	static bool block_size_is_power_of_two(struct pool *pool)
	368	{
	369	return pool->sectors_per_block_shift >= 0;
	370	}
	371
	372	static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
	373	{
	374	return block_size_is_power_of_two(pool) ?
	375	(b << pool->sectors_per_block_shift) :
	376	(b * pool->sectors_per_block);
	377	}
	378
/*----------------------------------------------------------------*/
	380
	381	struct discard_op {
	382	struct thin_c *tc;
	383	struct blk_plug plug;
	384	struct bio *parent_bio;
	385	struct bio *bio;
	386	};
	387
	388	static void begin_discard(struct discard_op op, struct thin_c tc, struct bio *parent)
	389	{
	390	BUG_ON(!parent);
	391
	392	op->tc = tc;
	393	blk_start_plug(&op->plug);
	394	op->parent_bio = parent;
	395	op->bio = NULL;
	396	}
	397
	398	static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
34fbcf62	399	{
202bae52	400	struct thin_c *tc = op->tc;
34fbcf62 JT	401	sector_t s = block_to_sectors(tc->pool, data_b);
34fbcf62 JT	402	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
3dba53a9	403
722d9082	404	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio);
202bae52 JT	405	}
	406
	407	static void end_discard(struct discard_op *op, int r)
	408	{
	409	if (op->bio) {
	410	/*
	411	* Even if one of the calls to issue_discard failed, we
	412	* need to wait for the chain to complete.
	413	*/
	414	bio_chain(op->bio, op->parent_bio);
c34b7ac6	415	op->bio->bi_opf = REQ_OP_DISCARD;
4e49ea4a	416	submit_bio(op->bio);
3dba53a9	417	}
34fbcf62	418
202bae52 JT	419	blk_finish_plug(&op->plug);
	420
	421	/*
	422	* Even if r is set, there could be sub discards in flight that we
	423	* need to wait for.
	424	*/
4e4cbee9 CH	425	if (r && !op->parent_bio->bi_status)
4e4cbee9 CH	426	op->parent_bio->bi_status = errno_to_blk_status(r);
202bae52	427	bio_endio(op->parent_bio);
34fbcf62 JT	428	}
	429
/*----------------------------------------------------------------*/
	431
025b9685 JT	432	/*
	433	* wake_worker() is used when new work is queued and when pool_resume is
	434	* ready to continue deferred IO processing.
	435	*/
	436	static void wake_worker(struct pool *pool)
	437	{
	438	queue_work(pool->wq, &pool->worker);
	439	}
	440
/*----------------------------------------------------------------*/
	442
6beca5eb JT	443	static int bio_detain(struct pool pool, struct dm_cell_key key, struct bio *bio,
	444	struct dm_bio_prison_cell **cell_result)
	445	{
	446	int r;
	447	struct dm_bio_prison_cell *cell_prealloc;
	448
	449	/*
	450	* Allocate a cell from the prison's mempool.
	451	* This might block but it can't fail.
	452	*/
	453	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
	454
	455	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
47c00dcd	456	if (r) {
6beca5eb JT	457	/*
	458	* We reused an old cell; we can get rid of
	459	* the new one.
	460	*/
	461	dm_bio_prison_free_cell(pool->prison, cell_prealloc);
47c00dcd	462	}
6beca5eb JT	463
	464	return r;
	465	}
	466
	467	static void cell_release(struct pool *pool,
	468	struct dm_bio_prison_cell *cell,
	469	struct bio_list *bios)
	470	{
	471	dm_cell_release(pool->prison, cell, bios);
	472	dm_bio_prison_free_cell(pool->prison, cell);
	473	}
	474
2d759a46 JT	475	static void cell_visit_release(struct pool *pool,
	476	void (fn)(void , struct dm_bio_prison_cell *),
	477	void *context,
	478	struct dm_bio_prison_cell *cell)
	479	{
	480	dm_cell_visit_release(pool->prison, fn, context, cell);
	481	dm_bio_prison_free_cell(pool->prison, cell);
	482	}
	483
6beca5eb JT	484	static void cell_release_no_holder(struct pool *pool,
	485	struct dm_bio_prison_cell *cell,
	486	struct bio_list *bios)
	487	{
	488	dm_cell_release_no_holder(pool->prison, cell, bios);
	489	dm_bio_prison_free_cell(pool->prison, cell);
	490	}
	491
af91805a	492	static void cell_error_with_code(struct pool *pool,
4e4cbee9	493	struct dm_bio_prison_cell *cell, blk_status_t error_code)
6beca5eb	494	{
af91805a	495	dm_cell_error(pool->prison, cell, error_code);
6beca5eb JT	496	dm_bio_prison_free_cell(pool->prison, cell);
	497	}
	498
4e4cbee9	499	static blk_status_t get_pool_io_error_code(struct pool *pool)
c3667cc6	500	{
4e4cbee9	501	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
c3667cc6 MS	502	}
c3667cc6 MS	503
/* Error @cell with the pool's current IO error code. */
static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
}
af91805a MS	508
/* Finish @cell with status 0 (no error). */
static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}
	513
	514	static void cell_requeue(struct pool pool, struct dm_bio_prison_cell cell)
	515	{
4e4cbee9	516	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
a374bb21 JT	517	}
a374bb21 JT	518
/*----------------------------------------------------------------*/
6beca5eb JT	520
991d9fa0 JT	521	/*
	522	* A global list of pools that uses a struct mapped_device as a key.
	523	*/
	524	static struct dm_thin_pool_table {
	525	struct mutex mutex;
	526	struct list_head pools;
	527	} dm_thin_pool_table;
	528
	529	static void pool_table_init(void)
	530	{
	531	mutex_init(&dm_thin_pool_table.mutex);
	532	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
	533	}
	534
d5ffebdd MS	535	static void pool_table_exit(void)
	536	{
	537	mutex_destroy(&dm_thin_pool_table.mutex);
	538	}
	539
991d9fa0 JT	540	static void __pool_table_insert(struct pool *pool)
	541	{
	542	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	543	list_add(&pool->list, &dm_thin_pool_table.pools);
	544	}
	545
	546	static void __pool_table_remove(struct pool *pool)
	547	{
	548	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	549	list_del(&pool->list);
	550	}
	551
	552	static struct pool __pool_table_lookup(struct mapped_device md)
	553	{
	554	struct pool pool = NULL, tmp;
	555
	556	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	557
	558	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
	559	if (tmp->pool_md == md) {
	560	pool = tmp;
	561	break;
	562	}
	563	}
	564
	565	return pool;
	566	}
	567
	568	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
	569	{
	570	struct pool pool = NULL, tmp;
	571
	572	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	573
	574	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
	575	if (tmp->md_dev == md_dev) {
	576	pool = tmp;
	577	break;
	578	}
	579	}
	580
	581	return pool;
	582	}
	583
/*----------------------------------------------------------------*/
	585
a24c2569	586	struct dm_thin_endio_hook {
eb2aa48d	587	struct thin_c *tc;
44feb387 MS	588	struct dm_deferred_entry *shared_read_entry;
44feb387 MS	589	struct dm_deferred_entry *all_io_entry;
a24c2569	590	struct dm_thin_new_mapping *overwrite_mapping;
67324ea1	591	struct rb_node rb_node;
34fbcf62	592	struct dm_bio_prison_cell *cell;
eb2aa48d JT	593	};
eb2aa48d JT	594
4e4cbee9	595	static void error_bio_list(struct bio_list *bios, blk_status_t error)
991d9fa0 JT	596	{
991d9fa0 JT	597	struct bio *bio;
42d6a8ce	598
4246a0b6	599	while ((bio = bio_list_pop(bios))) {
4e4cbee9	600	bio->bi_status = error;
4246a0b6 CH	601	bio_endio(bio);
4246a0b6 CH	602	}
42d6a8ce MS	603	}
42d6a8ce MS	604
4e4cbee9 CH	605	static void error_thin_bio_list(struct thin_c tc, struct bio_list master,
4e4cbee9 CH	606	blk_status_t error)
42d6a8ce	607	{
991d9fa0 JT	608	struct bio_list bios;
	609
	610	bio_list_init(&bios);
18adc577	611
8e0c9dac	612	spin_lock_irq(&tc->lock);
50bc2150	613	bio_list_merge_init(&bios, master);
8e0c9dac	614	spin_unlock_irq(&tc->lock);
991d9fa0	615
42d6a8ce	616	error_bio_list(&bios, error);
991d9fa0 JT	617	}
991d9fa0 JT	618
a374bb21 JT	619	static void requeue_deferred_cells(struct thin_c *tc)
	620	{
	621	struct pool *pool = tc->pool;
a374bb21 JT	622	struct list_head cells;
	623	struct dm_bio_prison_cell cell, tmp;
	624
	625	INIT_LIST_HEAD(&cells);
	626
8e0c9dac	627	spin_lock_irq(&tc->lock);
a374bb21	628	list_splice_init(&tc->deferred_cells, &cells);
8e0c9dac	629	spin_unlock_irq(&tc->lock);
a374bb21 JT	630
	631	list_for_each_entry_safe(cell, tmp, &cells, user_list)
	632	cell_requeue(pool, cell);
	633	}
	634
991d9fa0 JT	635	static void requeue_io(struct thin_c *tc)
991d9fa0 JT	636	{
3e1a0699 JT	637	struct bio_list bios;
	638
	639	bio_list_init(&bios);
	640
8e0c9dac	641	spin_lock_irq(&tc->lock);
50bc2150 CH	642	bio_list_merge_init(&bios, &tc->deferred_bio_list);
50bc2150 CH	643	bio_list_merge_init(&bios, &tc->retry_on_resume_list);
8e0c9dac	644	spin_unlock_irq(&tc->lock);
3e1a0699	645
4e4cbee9	646	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
42d6a8ce	647	requeue_deferred_cells(tc);
3e1a0699 JT	648	}
3e1a0699 JT	649
4e4cbee9	650	static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
c140e1c4 MS	651	{
	652	struct thin_c *tc;
	653
	654	rcu_read_lock();
	655	list_for_each_entry_rcu(tc, &pool->active_thins, list)
0a927c2f	656	error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
c140e1c4 MS	657	rcu_read_unlock();
	658	}
	659
/* Error every thin's retry list using the pool's current IO error code. */
static void error_retry_list(struct pool *pool)
{
	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}
0a927c2f MS	664
991d9fa0 JT	665	/*
	666	* This section of code contains the logic for processing a thin device's IO.
	667	* Much of the code depends on pool object resources (lists, workqueues, etc)
	668	* but most is exclusively called from the thin target rather than the thin-pool
	669	* target.
	670	*/
	671
	672	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
	673	{
58f77a21	674	struct pool *pool = tc->pool;
4f024f37	675	sector_t block_nr = bio->bi_iter.bi_sector;
55f2b8bd	676
58f77a21 MS	677	if (block_size_is_power_of_two(pool))
58f77a21 MS	678	block_nr >>= pool->sectors_per_block_shift;
f9a8e0cd	679	else
58f77a21	680	(void) sector_div(block_nr, pool->sectors_per_block);
55f2b8bd MS	681
55f2b8bd MS	682	return block_nr;
991d9fa0 JT	683	}
991d9fa0 JT	684
34fbcf62 JT	685	/*
	686	* Returns the _complete_ blocks that this bio covers.
	687	*/
	688	static void get_bio_block_range(struct thin_c tc, struct bio bio,
	689	dm_block_t begin, dm_block_t end)
	690	{
	691	struct pool *pool = tc->pool;
	692	sector_t b = bio->bi_iter.bi_sector;
	693	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
	694
	695	b += pool->sectors_per_block - 1ull; /* so we round up */
	696
	697	if (block_size_is_power_of_two(pool)) {
	698	b >>= pool->sectors_per_block_shift;
	699	e >>= pool->sectors_per_block_shift;
	700	} else {
	701	(void) sector_div(b, pool->sectors_per_block);
	702	(void) sector_div(e, pool->sectors_per_block);
	703	}
	704
47c00dcd	705	if (e < b) {
34fbcf62 JT	706	/* Can happen if the bio is within a single block. */
34fbcf62 JT	707	e = b;
47c00dcd	708	}
34fbcf62 JT	709
	710	*begin = b;
	711	*end = e;
	712	}
	713
991d9fa0 JT	714	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
	715	{
	716	struct pool *pool = tc->pool;
4f024f37	717	sector_t bi_sector = bio->bi_iter.bi_sector;
991d9fa0	718
74d46992	719	bio_set_dev(bio, tc->pool_dev->bdev);
47c00dcd	720	if (block_size_is_power_of_two(pool)) {
4f024f37 KO	721	bio->bi_iter.bi_sector =
	722	(block << pool->sectors_per_block_shift) \|
	723	(bi_sector & (pool->sectors_per_block - 1));
47c00dcd	724	} else {
4f024f37	725	bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
58f77a21	726	sector_div(bi_sector, pool->sectors_per_block);
47c00dcd	727	}
991d9fa0 JT	728	}
991d9fa0 JT	729
2dd9c257 JT	730	static void remap_to_origin(struct thin_c tc, struct bio bio)
2dd9c257 JT	731	{
74d46992	732	bio_set_dev(bio, tc->origin_dev->bdev);
2dd9c257 JT	733	}
2dd9c257 JT	734
4afdd680 JT	735	static int bio_triggers_commit(struct thin_c tc, struct bio bio)
4afdd680 JT	736	{
f73f44eb	737	return op_is_flush(bio->bi_opf) &&
4afdd680 JT	738	dm_thin_changed_this_transaction(tc->td);
	739	}
	740
e8088073 JT	741	static void inc_all_io_entry(struct pool pool, struct bio bio)
	742	{
	743	struct dm_thin_endio_hook *h;
	744
e6047149	745	if (bio_op(bio) == REQ_OP_DISCARD)
e8088073 JT	746	return;
e8088073 JT	747
59c3d2c6	748	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
e8088073 JT	749	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
	750	}
	751
2dd9c257	752	static void issue(struct thin_c tc, struct bio bio)
991d9fa0 JT	753	{
991d9fa0 JT	754	struct pool *pool = tc->pool;
991d9fa0	755
e49e5829	756	if (!bio_triggers_commit(tc, bio)) {
b7f8dff0	757	dm_submit_bio_remap(bio, NULL);
e49e5829 JT	758	return;
	759	}
	760
991d9fa0	761	/*
e49e5829 JT	762	* Complete bio with an error if earlier I/O caused changes to
	763	* the metadata that can't be committed e.g, due to I/O errors
	764	* on the metadata device.
991d9fa0	765	*/
e49e5829 JT	766	if (dm_thin_aborted_changes(tc->td)) {
	767	bio_io_error(bio);
	768	return;
	769	}
	770
	771	/*
	772	* Batch together any bios that trigger commits and then issue a
	773	* single commit for them in process_deferred_bios().
	774	*/
8e0c9dac	775	spin_lock_irq(&pool->lock);
e49e5829	776	bio_list_add(&pool->deferred_flush_bios, bio);
8e0c9dac	777	spin_unlock_irq(&pool->lock);
991d9fa0 JT	778	}
991d9fa0 JT	779
/* Redirect @bio to the origin device and issue it. */
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}
	785
	786	static void remap_and_issue(struct thin_c tc, struct bio bio,
	787	dm_block_t block)
	788	{
	789	remap(tc, bio, block);
	790	issue(tc, bio);
	791	}
	792
/*----------------------------------------------------------------*/
	794
	795	/*
	796	* Bio endio functions.
	797	*/
a24c2569	798	struct dm_thin_new_mapping {
991d9fa0 JT	799	struct list_head list;
991d9fa0 JT	800
7f214665	801	bool pass_discard:1;
34fbcf62	802	bool maybe_shared:1;
991d9fa0	803
50f3c3ef JT	804	/*
	805	* Track quiescing, copying and zeroing preparation actions. When this
	806	* counter hits zero the block is prepared and can be inserted into the
	807	* btree.
	808	*/
	809	atomic_t prepare_actions;
	810
4e4cbee9	811	blk_status_t status;
991d9fa0	812	struct thin_c *tc;
34fbcf62	813	dm_block_t virt_begin, virt_end;
991d9fa0	814	dm_block_t data_block;
34fbcf62	815	struct dm_bio_prison_cell *cell;
991d9fa0 JT	816
	817	/*
	818	* If the bio covers the whole area of a block then we can avoid
	819	* zeroing or copying. Instead this bio is hooked. The bio will
	820	* still be in the cell, so care has to be taken to avoid issuing
	821	* the bio twice.
	822	*/
	823	struct bio *bio;
	824	bio_end_io_t *saved_bi_end_io;
	825	};
	826
50f3c3ef	827	static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
991d9fa0 JT	828	{
	829	struct pool *pool = m->tc->pool;
	830
50f3c3ef	831	if (atomic_dec_and_test(&m->prepare_actions)) {
daec338b	832	list_add_tail(&m->list, &pool->prepared_mappings);
991d9fa0 JT	833	wake_worker(pool);
	834	}
	835	}
	836
e5aea7b4	837	static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
991d9fa0 JT	838	{
991d9fa0 JT	839	unsigned long flags;
991d9fa0 JT	840	struct pool *pool = m->tc->pool;
991d9fa0 JT	841
991d9fa0	842	spin_lock_irqsave(&pool->lock, flags);
50f3c3ef	843	__complete_mapping_preparation(m);
991d9fa0 JT	844	spin_unlock_irqrestore(&pool->lock, flags);
	845	}
	846
e5aea7b4 JT	847	static void copy_complete(int read_err, unsigned long write_err, void *context)
	848	{
	849	struct dm_thin_new_mapping *m = context;
	850
4e4cbee9	851	m->status = read_err \|\| write_err ? BLK_STS_IOERR : 0;
e5aea7b4 JT	852	complete_mapping_preparation(m);
	853	}
	854
4246a0b6	855	static void overwrite_endio(struct bio *bio)
991d9fa0	856	{
59c3d2c6	857	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
a24c2569	858	struct dm_thin_new_mapping *m = h->overwrite_mapping;
991d9fa0	859
8b908f8e MS	860	bio->bi_end_io = m->saved_bi_end_io;
8b908f8e MS	861
4e4cbee9	862	m->status = bio->bi_status;
e5aea7b4	863	complete_mapping_preparation(m);
991d9fa0 JT	864	}
991d9fa0 JT	865
/*----------------------------------------------------------------*/
	867
	868	/*
	869	* Workqueue.
	870	*/
	871
	872	/*
	873	* Prepared mapping jobs.
	874	*/
	875
	876	/*
2d759a46 JT	877	* This sends the bios in the cell, except the original holder, back
2d759a46 JT	878	* to the deferred_bios list.
991d9fa0	879	*/
f286ba0e	880	static void cell_defer_no_holder(struct thin_c tc, struct dm_bio_prison_cell cell)
991d9fa0	881	{
991d9fa0 JT	882	struct pool *pool = tc->pool;
991d9fa0 JT	883	unsigned long flags;
bb46c561	884	struct bio_list bios;
991d9fa0	885
bb46c561 JT	886	bio_list_init(&bios);
bb46c561 JT	887	cell_release_no_holder(pool, cell, &bios);
991d9fa0	888
bb46c561 JT	889	if (!bio_list_empty(&bios)) {
	890	spin_lock_irqsave(&tc->lock, flags);
	891	bio_list_merge(&tc->deferred_bio_list, &bios);
	892	spin_unlock_irqrestore(&tc->lock, flags);
d256d796	893	wake_worker(pool);
bb46c561	894	}
991d9fa0 JT	895	}
991d9fa0 JT	896
/* Defined later in this file; used by inc_remap_and_issue_cell(). */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
a374bb21 JT	898
2d759a46 JT	899	struct remap_info {
	900	struct thin_c *tc;
	901	struct bio_list defer_bios;
	902	struct bio_list issue_bios;
	903	};
	904
	905	static void __inc_remap_and_issue_cell(void *context,
	906	struct dm_bio_prison_cell *cell)
a374bb21	907	{
2d759a46	908	struct remap_info *info = context;
a374bb21	909	struct bio *bio;
a374bb21	910
2d759a46	911	while ((bio = bio_list_pop(&cell->bios))) {
f73f44eb	912	if (op_is_flush(bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD)
2d759a46	913	bio_list_add(&info->defer_bios, bio);
a374bb21	914	else {
2d759a46 JT	915	inc_all_io_entry(info->tc->pool, bio);
	916
	917	/*
	918	* We can't issue the bios with the bio prison lock
	919	* held, so we add them to a list to issue on
	920	* return from this function.
	921	*/
	922	bio_list_add(&info->issue_bios, bio);
a374bb21 JT	923	}
	924	}
	925	}
	926
2d759a46 JT	927	static void inc_remap_and_issue_cell(struct thin_c *tc,
	928	struct dm_bio_prison_cell *cell,
	929	dm_block_t block)
	930	{
	931	struct bio *bio;
	932	struct remap_info info;
	933
	934	info.tc = tc;
	935	bio_list_init(&info.defer_bios);
	936	bio_list_init(&info.issue_bios);
	937
	938	/*
	939	* We have to be careful to inc any bios we're about to issue
	940	* before the cell is released, and avoid a race with new bios
	941	* being added to the cell.
	942	*/
	943	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
	944	&info, cell);
	945
	946	while ((bio = bio_list_pop(&info.defer_bios)))
	947	thin_defer_bio(tc, bio);
	948
	949	while ((bio = bio_list_pop(&info.issue_bios)))
	950	remap_and_issue(info.tc, bio, block);
	951	}
	952
e49e5829 JT	953	static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
e49e5829 JT	954	{
6beca5eb	955	cell_error(m->tc->pool, m->cell);
e49e5829	956	list_del(&m->list);
6f1c819c	957	mempool_free(m, &m->tc->pool->mapping_pool);
e49e5829	958	}
025b9685	959
4ae280b4 NT	960	static void complete_overwrite_bio(struct thin_c tc, struct bio bio)
	961	{
	962	struct pool *pool = tc->pool;
4ae280b4 NT	963
	964	/*
	965	* If the bio has the REQ_FUA flag set we must commit the metadata
	966	* before signaling its completion.
	967	*/
	968	if (!bio_triggers_commit(tc, bio)) {
	969	bio_endio(bio);
	970	return;
	971	}
	972
	973	/*
	974	* Complete bio with an error if earlier I/O caused changes to the
	975	* metadata that can't be committed, e.g., due to I/O errors on the
	976	* metadata device.
	977	*/
	978	if (dm_thin_aborted_changes(tc->td)) {
	979	bio_io_error(bio);
	980	return;
	981	}
	982
	983	/*
	984	* Batch together any bios that trigger commits and then issue a
	985	* single commit for them in process_deferred_bios().
	986	*/
8e0c9dac	987	spin_lock_irq(&pool->lock);
4ae280b4	988	bio_list_add(&pool->deferred_flush_completions, bio);
8e0c9dac	989	spin_unlock_irq(&pool->lock);
4ae280b4 NT	990	}
4ae280b4 NT	991
a24c2569	992	static void process_prepared_mapping(struct dm_thin_new_mapping *m)
991d9fa0 JT	993	{
991d9fa0 JT	994	struct thin_c *tc = m->tc;
6beca5eb	995	struct pool *pool = tc->pool;
8b908f8e	996	struct bio *bio = m->bio;
991d9fa0 JT	997	int r;
991d9fa0 JT	998
4e4cbee9	999	if (m->status) {
6beca5eb	1000	cell_error(pool, m->cell);
905386f8	1001	goto out;
991d9fa0 JT	1002	}
	1003
	1004	/*
	1005	* Commit the prepared block into the mapping btree.
	1006	* Any I/O for this block arriving after this point will get
	1007	* remapped to it directly.
	1008	*/
34fbcf62	1009	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
991d9fa0	1010	if (r) {
b5330655	1011	metadata_operation_failed(pool, "dm_thin_insert_block", r);
6beca5eb	1012	cell_error(pool, m->cell);
905386f8	1013	goto out;
991d9fa0 JT	1014	}
	1015
	1016	/*
	1017	* Release any bios held while the block was being provisioned.
	1018	* If we are processing a write bio that completely covers the block,
	1019	* we already processed it so can ignore it now when processing
	1020	* the bios in the cell.
	1021	*/
	1022	if (bio) {
2d759a46	1023	inc_remap_and_issue_cell(tc, m->cell, m->data_block);
4ae280b4	1024	complete_overwrite_bio(tc, bio);
2d759a46 JT	1025	} else {
	1026	inc_all_io_entry(tc->pool, m->cell->holder);
	1027	remap_and_issue(tc, m->cell->holder, m->data_block);
	1028	inc_remap_and_issue_cell(tc, m->cell, m->data_block);
	1029	}
991d9fa0	1030
905386f8	1031	out:
991d9fa0	1032	list_del(&m->list);
6f1c819c	1033	mempool_free(m, &pool->mapping_pool);
991d9fa0 JT	1034	}
991d9fa0 JT	1035
34fbcf62 JT	1036	/*----------------------------------------------------------------*/
	1037
	1038	static void free_discard_mapping(struct dm_thin_new_mapping *m)
104655fd	1039	{
104655fd	1040	struct thin_c *tc = m->tc;
0ef0b471	1041
34fbcf62 JT	1042	if (m->cell)
34fbcf62 JT	1043	cell_defer_no_holder(tc, m->cell);
6f1c819c	1044	mempool_free(m, &tc->pool->mapping_pool);
34fbcf62	1045	}
104655fd	1046
34fbcf62 JT	1047	static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
34fbcf62 JT	1048	{
e49e5829	1049	bio_io_error(m->bio);
34fbcf62 JT	1050	free_discard_mapping(m);
	1051	}
	1052
	1053	static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
	1054	{
4246a0b6	1055	bio_endio(m->bio);
34fbcf62 JT	1056	free_discard_mapping(m);
	1057	}
	1058
	1059	static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
	1060	{
	1061	int r;
	1062	struct thin_c *tc = m->tc;
	1063
	1064	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
	1065	if (r) {
	1066	metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
	1067	bio_io_error(m->bio);
	1068	} else
4246a0b6	1069	bio_endio(m->bio);
34fbcf62	1070
f286ba0e	1071	cell_defer_no_holder(tc, m->cell);
6f1c819c	1072	mempool_free(m, &tc->pool->mapping_pool);
e49e5829 JT	1073	}
e49e5829 JT	1074
202bae52 JT	1075	/*----------------------------------------------------------------*/
202bae52 JT	1076
2a0fbffb JT	1077	static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
2a0fbffb JT	1078	struct bio *discard_parent)
e49e5829	1079	{
34fbcf62 JT	1080	/*
	1081	* We've already unmapped this range of blocks, but before we
	1082	* passdown we have to check that these blocks are now unused.
	1083	*/
202bae52	1084	int r = 0;
d445bd9c	1085	bool shared = true;
e49e5829	1086	struct thin_c *tc = m->tc;
34fbcf62 JT	1087	struct pool *pool = tc->pool;
34fbcf62 JT	1088	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
202bae52	1089	struct discard_op op;
104655fd	1090
2a0fbffb	1091	begin_discard(&op, tc, discard_parent);
34fbcf62 JT	1092	while (b != end) {
	1093	/* find start of unmapped run */
	1094	for (; b < end; b++) {
d445bd9c	1095	r = dm_pool_block_is_shared(pool->pmd, b, &shared);
34fbcf62	1096	if (r)
202bae52	1097	goto out;
e8088073	1098
d445bd9c	1099	if (!shared)
34fbcf62	1100	break;
19fa1a67	1101	}
104655fd	1102
34fbcf62 JT	1103	if (b == end)
	1104	break;
	1105
	1106	/* find end of run */
	1107	for (e = b + 1; e != end; e++) {
d445bd9c	1108	r = dm_pool_block_is_shared(pool->pmd, e, &shared);
34fbcf62	1109	if (r)
202bae52	1110	goto out;
34fbcf62	1111
d445bd9c	1112	if (shared)
34fbcf62 JT	1113	break;
	1114	}
	1115
202bae52	1116	r = issue_discard(&op, b, e);
34fbcf62	1117	if (r)
202bae52	1118	goto out;
34fbcf62 JT	1119
	1120	b = e;
	1121	}
202bae52 JT	1122	out:
202bae52 JT	1123	end_discard(&op, r);
104655fd JT	1124	}
104655fd JT	1125
2a0fbffb JT	1126	static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
	1127	{
	1128	unsigned long flags;
	1129	struct pool *pool = m->tc->pool;
	1130
	1131	spin_lock_irqsave(&pool->lock, flags);
	1132	list_add_tail(&m->list, &pool->prepared_discards_pt2);
	1133	spin_unlock_irqrestore(&pool->lock, flags);
	1134	wake_worker(pool);
	1135	}
	1136
	1137	static void passdown_endio(struct bio *bio)
	1138	{
	1139	/*
	1140	* It doesn't matter if the passdown discard failed, we still want
	1141	* to unmap (we ignore err).
	1142	*/
	1143	queue_passdown_pt2(bio->bi_private);
948f581a	1144	bio_put(bio);
2a0fbffb JT	1145	}
	1146
	1147	static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
e49e5829 JT	1148	{
	1149	int r;
	1150	struct thin_c *tc = m->tc;
34fbcf62	1151	struct pool *pool = tc->pool;
2a0fbffb JT	1152	struct bio *discard_parent;
2a0fbffb JT	1153	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
e49e5829	1154
2a0fbffb JT	1155	/*
	1156	* Only this thread allocates blocks, so we can be sure that the
	1157	* newly unmapped blocks will not be allocated before the end of
	1158	* the function.
	1159	*/
34fbcf62	1160	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
202bae52	1161	if (r) {
34fbcf62	1162	metadata_operation_failed(pool, "dm_thin_remove_range", r);
202bae52	1163	bio_io_error(m->bio);
2a0fbffb	1164	cell_defer_no_holder(tc, m->cell);
6f1c819c	1165	mempool_free(m, &pool->mapping_pool);
2a0fbffb JT	1166	return;
2a0fbffb JT	1167	}
34fbcf62	1168
00a0ea33 VV	1169	/*
	1170	* Increment the unmapped blocks. This prevents a race between the
	1171	* passdown io and reallocation of freed blocks.
	1172	*/
	1173	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
	1174	if (r) {
	1175	metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
	1176	bio_io_error(m->bio);
	1177	cell_defer_no_holder(tc, m->cell);
6f1c819c	1178	mempool_free(m, &pool->mapping_pool);
00a0ea33 VV	1179	return;
	1180	}
	1181
07888c66	1182	discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO);
53db984e CH	1183	discard_parent->bi_end_io = passdown_endio;
53db984e CH	1184	discard_parent->bi_private = m;
255e2646 HM	1185	if (m->maybe_shared)
	1186	passdown_double_checking_shared_status(m, discard_parent);
	1187	else {
53db984e	1188	struct discard_op op;
2a0fbffb	1189
53db984e CH	1190	begin_discard(&op, tc, discard_parent);
	1191	r = issue_discard(&op, m->data_block, data_end);
	1192	end_discard(&op, r);
202bae52	1193	}
2a0fbffb JT	1194	}
	1195
	1196	static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
	1197	{
	1198	int r;
	1199	struct thin_c *tc = m->tc;
	1200	struct pool *pool = tc->pool;
	1201
	1202	/*
	1203	* The passdown has completed, so now we can decrement all those
	1204	* unmapped blocks.
	1205	*/
	1206	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
	1207	m->data_block + (m->virt_end - m->virt_begin));
	1208	if (r) {
	1209	metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
	1210	bio_io_error(m->bio);
	1211	} else
	1212	bio_endio(m->bio);
	1213
34fbcf62	1214	cell_defer_no_holder(tc, m->cell);
6f1c819c	1215	mempool_free(m, &pool->mapping_pool);
e49e5829 JT	1216	}
e49e5829 JT	1217
104655fd	1218	static void process_prepared(struct pool *pool, struct list_head *head,
e49e5829	1219	process_mapping_fn *fn)
991d9fa0	1220	{
991d9fa0	1221	struct list_head maps;
a24c2569	1222	struct dm_thin_new_mapping *m, *tmp;
991d9fa0 JT	1223
991d9fa0 JT	1224	INIT_LIST_HEAD(&maps);
8e0c9dac	1225	spin_lock_irq(&pool->lock);
104655fd	1226	list_splice_init(head, &maps);
8e0c9dac	1227	spin_unlock_irq(&pool->lock);
991d9fa0 JT	1228
991d9fa0 JT	1229	list_for_each_entry_safe(m, tmp, &maps, list)
e49e5829	1230	(*fn)(m);
991d9fa0 JT	1231	}
	1232
	1233	/*
	1234	* Deferred bio jobs.
	1235	*/
104655fd	1236	static int io_overlaps_block(struct pool *pool, struct bio *bio)
991d9fa0	1237	{
4f024f37 KO	1238	return bio->bi_iter.bi_size ==
4f024f37 KO	1239	(pool->sectors_per_block << SECTOR_SHIFT);
104655fd JT	1240	}
	1241
	1242	static int io_overwrites_block(struct pool *pool, struct bio *bio)
	1243	{
	1244	return (bio_data_dir(bio) == WRITE) &&
	1245	io_overlaps_block(pool, bio);
991d9fa0 JT	1246	}
	1247
	1248	static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
	1249	bio_end_io_t *fn)
	1250	{
	1251	*save = bio->bi_end_io;
	1252	bio->bi_end_io = fn;
	1253	}
	1254
	1255	static int ensure_next_mapping(struct pool *pool)
	1256	{
	1257	if (pool->next_mapping)
	1258	return 0;
	1259
6f1c819c	1260	pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
991d9fa0 JT	1261
	1262	return pool->next_mapping ? 0 : -ENOMEM;
	1263	}
	1264
a24c2569	1265	static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
991d9fa0	1266	{
16961b04	1267	struct dm_thin_new_mapping *m = pool->next_mapping;
991d9fa0 JT	1268
	1269	BUG_ON(!pool->next_mapping);
	1270
16961b04 MS	1271	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	1272	INIT_LIST_HEAD(&m->list);
	1273	m->bio = NULL;
	1274
991d9fa0 JT	1275	pool->next_mapping = NULL;
991d9fa0 JT	1276
16961b04	1277	return m;
991d9fa0 JT	1278	}
991d9fa0 JT	1279
e5aea7b4 JT	1280	static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
	1281	sector_t begin, sector_t end)
	1282	{
e5aea7b4 JT	1283	struct dm_io_region to;
	1284
	1285	to.bdev = tc->pool_dev->bdev;
	1286	to.sector = begin;
	1287	to.count = end - begin;
	1288
7209049d	1289	dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
e5aea7b4 JT	1290	}
e5aea7b4 JT	1291
452d7a62	1292	static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
34fbcf62	1293	dm_block_t data_begin,
452d7a62 MS	1294	struct dm_thin_new_mapping *m)
	1295	{
	1296	struct pool *pool = tc->pool;
	1297	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	1298
	1299	h->overwrite_mapping = m;
	1300	m->bio = bio;
	1301	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
	1302	inc_all_io_entry(pool, bio);
34fbcf62	1303	remap_and_issue(tc, bio, data_begin);
452d7a62 MS	1304	}
452d7a62 MS	1305
e5aea7b4 JT	1306	/*
	1307	* A partial copy also needs to zero the uncopied region.
	1308	*/
991d9fa0	1309	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
2dd9c257 JT	1310	struct dm_dev *origin, dm_block_t data_origin,
2dd9c257 JT	1311	dm_block_t data_dest,
e5aea7b4 JT	1312	struct dm_bio_prison_cell *cell, struct bio *bio,
e5aea7b4 JT	1313	sector_t len)
991d9fa0	1314	{
991d9fa0	1315	struct pool *pool = tc->pool;
a24c2569	1316	struct dm_thin_new_mapping *m = get_next_mapping(pool);
991d9fa0	1317
991d9fa0	1318	m->tc = tc;
34fbcf62 JT	1319	m->virt_begin = virt_block;
34fbcf62 JT	1320	m->virt_end = virt_block + 1u;
991d9fa0 JT	1321	m->data_block = data_dest;
991d9fa0 JT	1322	m->cell = cell;
991d9fa0	1323
e5aea7b4 JT	1324	/*
	1325	* quiesce action + copy action + an extra reference held for the
	1326	* duration of this function (we may need to inc later for a
	1327	* partial zero).
	1328	*/
	1329	atomic_set(&m->prepare_actions, 3);
	1330
44feb387	1331	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
e5aea7b4	1332	complete_mapping_preparation(m); /* already quiesced */
991d9fa0 JT	1333
	1334	/*
	1335	* IO to pool_dev remaps to the pool target's data_dev.
	1336	*
	1337	* If the whole block of data is being overwritten, we can issue the
	1338	* bio immediately. Otherwise we use kcopyd to clone the data first.
	1339	*/
452d7a62 MS	1340	if (io_overwrites_block(pool, bio))
	1341	remap_and_issue_overwrite(tc, bio, data_dest, m);
	1342	else {
991d9fa0 JT	1343	struct dm_io_region from, to;
991d9fa0 JT	1344
2dd9c257	1345	from.bdev = origin->bdev;
991d9fa0	1346	from.sector = data_origin * pool->sectors_per_block;
e5aea7b4	1347	from.count = len;
991d9fa0 JT	1348
	1349	to.bdev = tc->pool_dev->bdev;
	1350	to.sector = data_dest * pool->sectors_per_block;
e5aea7b4	1351	to.count = len;
991d9fa0	1352
7209049d MS	1353	dm_kcopyd_copy(pool->copier, &from, 1, &to,
7209049d MS	1354	0, copy_complete, m);
e5aea7b4 JT	1355
	1356	/*
	1357	* Do we need to zero a tail region?
	1358	*/
	1359	if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
	1360	atomic_inc(&m->prepare_actions);
	1361	ll_zero(tc, m,
	1362	data_dest * pool->sectors_per_block + len,
	1363	(data_dest + 1) * pool->sectors_per_block);
991d9fa0 JT	1364	}
991d9fa0 JT	1365	}
e5aea7b4 JT	1366
e5aea7b4 JT	1367	complete_mapping_preparation(m); /* drop our ref */
991d9fa0 JT	1368	}
991d9fa0 JT	1369
2dd9c257 JT	1370	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
2dd9c257 JT	1371	dm_block_t data_origin, dm_block_t data_dest,
a24c2569	1372	struct dm_bio_prison_cell *cell, struct bio *bio)
2dd9c257 JT	1373	{
2dd9c257 JT	1374	schedule_copy(tc, virt_block, tc->pool_dev,
e5aea7b4 JT	1375	data_origin, data_dest, cell, bio,
e5aea7b4 JT	1376	tc->pool->sectors_per_block);
2dd9c257 JT	1377	}
2dd9c257 JT	1378
991d9fa0	1379	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
a24c2569	1380	dm_block_t data_block, struct dm_bio_prison_cell *cell,
991d9fa0 JT	1381	struct bio *bio)
	1382	{
	1383	struct pool *pool = tc->pool;
a24c2569	1384	struct dm_thin_new_mapping *m = get_next_mapping(pool);
991d9fa0	1385
50f3c3ef	1386	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
991d9fa0	1387	m->tc = tc;
34fbcf62 JT	1388	m->virt_begin = virt_block;
34fbcf62 JT	1389	m->virt_end = virt_block + 1u;
991d9fa0 JT	1390	m->data_block = data_block;
991d9fa0 JT	1391	m->cell = cell;
991d9fa0 JT	1392
	1393	/*
	1394	* If the whole block of data is being overwritten or we are not
	1395	* zeroing pre-existing data, we can issue the bio immediately.
	1396	* Otherwise we use kcopyd to zero the data first.
	1397	*/
f8ae7525 MS	1398	if (pool->pf.zero_new_blocks) {
	1399	if (io_overwrites_block(pool, bio))
	1400	remap_and_issue_overwrite(tc, bio, data_block, m);
47c00dcd	1401	else {
f8ae7525 MS	1402	ll_zero(tc, m, data_block * pool->sectors_per_block,
f8ae7525 MS	1403	(data_block + 1) * pool->sectors_per_block);
47c00dcd	1404	}
f8ae7525	1405	} else
991d9fa0	1406	process_prepared_mapping(m);
e5aea7b4	1407	}
991d9fa0	1408
e5aea7b4 JT	1409	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
	1410	dm_block_t data_dest,
	1411	struct dm_bio_prison_cell *cell, struct bio *bio)
	1412	{
	1413	struct pool *pool = tc->pool;
	1414	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
	1415	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
	1416
47c00dcd	1417	if (virt_block_end <= tc->origin_size) {
e5aea7b4 JT	1418	schedule_copy(tc, virt_block, tc->origin_dev,
	1419	virt_block, data_dest, cell, bio,
	1420	pool->sectors_per_block);
	1421
47c00dcd	1422	} else if (virt_block_begin < tc->origin_size) {
e5aea7b4 JT	1423	schedule_copy(tc, virt_block, tc->origin_dev,
	1424	virt_block, data_dest, cell, bio,
	1425	tc->origin_size - virt_block_begin);
	1426
47c00dcd	1427	} else
e5aea7b4	1428	schedule_zero(tc, virt_block, data_dest, cell, bio);
991d9fa0 JT	1429	}
991d9fa0 JT	1430
2c43fd26 JT	1431	static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
2c43fd26 JT	1432
a685557f MS	1433	static void requeue_bios(struct pool *pool);
a685557f MS	1434
3ab91828 JT	1435	static bool is_read_only_pool_mode(enum pool_mode mode)
	1436	{
	1437	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
	1438	}
	1439
	1440	static bool is_read_only(struct pool *pool)
	1441	{
	1442	return is_read_only_pool_mode(get_pool_mode(pool));
	1443	}
	1444
	1445	static void check_for_metadata_space(struct pool *pool)
	1446	{
	1447	int r;
	1448	const char *ooms_reason = NULL;
	1449	dm_block_t nr_free;
	1450
	1451	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
	1452	if (r)
	1453	ooms_reason = "Could not get free metadata blocks";
	1454	else if (!nr_free)
	1455	ooms_reason = "No free metadata blocks";
	1456
	1457	if (ooms_reason && !is_read_only(pool)) {
	1458	DMERR("%s", ooms_reason);
	1459	set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
	1460	}
	1461	}
	1462
	1463	static void check_for_data_space(struct pool *pool)
2c43fd26 JT	1464	{
	1465	int r;
	1466	dm_block_t nr_free;
	1467
	1468	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
	1469	return;
	1470
	1471	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
	1472	if (r)
	1473	return;
	1474
a685557f	1475	if (nr_free) {
2c43fd26	1476	set_pool_mode(pool, PM_WRITE);
a685557f MS	1477	requeue_bios(pool);
a685557f MS	1478	}
2c43fd26 JT	1479	}
2c43fd26 JT	1480
e49e5829 JT	1481	/*
	1482	* A non-zero return indicates read_only or fail_io mode.
	1483	* Many callers don't care about the return value.
	1484	*/
020cc3b5	1485	static int commit(struct pool *pool)
e49e5829 JT	1486	{
	1487	int r;
	1488
3ab91828	1489	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
e49e5829 JT	1490	return -EINVAL;
e49e5829 JT	1491
020cc3b5	1492	r = dm_pool_commit_metadata(pool->pmd);
b5330655 JT	1493	if (r)
b5330655 JT	1494	metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
3ab91828 JT	1495	else {
	1496	check_for_metadata_space(pool);
	1497	check_for_data_space(pool);
	1498	}
e49e5829 JT	1499
	1500	return r;
	1501	}
	1502
88a6621b JT	1503	static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
88a6621b JT	1504	{
88a6621b JT	1505	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
	1506	DMWARN("%s: reached low water mark for data device: sending event.",
	1507	dm_device_name(pool->pool_md));
8e0c9dac	1508	spin_lock_irq(&pool->lock);
88a6621b	1509	pool->low_water_triggered = true;
8e0c9dac	1510	spin_unlock_irq(&pool->lock);
88a6621b JT	1511	dm_table_event(pool->ti->table);
	1512	}
	1513	}
	1514
991d9fa0 JT	1515	static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
	1516	{
	1517	int r;
	1518	dm_block_t free_blocks;
991d9fa0 JT	1519	struct pool *pool = tc->pool;
991d9fa0 JT	1520
3e1a0699	1521	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
8d30abff JT	1522	return -EINVAL;
8d30abff JT	1523
991d9fa0	1524	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
b5330655 JT	1525	if (r) {
b5330655 JT	1526	metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
991d9fa0	1527	return r;
b5330655	1528	}
991d9fa0	1529
88a6621b	1530	check_low_water_mark(pool, free_blocks);
991d9fa0 JT	1531
991d9fa0 JT	1532	if (!free_blocks) {
94563bad MS	1533	/*
	1534	* Try to commit to see if that will free up some
	1535	* more space.
	1536	*/
020cc3b5 JT	1537	r = commit(pool);
	1538	if (r)
	1539	return r;
991d9fa0	1540
94563bad	1541	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
b5330655 JT	1542	if (r) {
b5330655 JT	1543	metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
94563bad	1544	return r;
b5330655	1545	}
991d9fa0	1546
94563bad	1547	if (!free_blocks) {
3e1a0699	1548	set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
94563bad	1549	return -ENOSPC;
991d9fa0 JT	1550	}
	1551	}
	1552
	1553	r = dm_pool_alloc_data_block(pool->pmd, result);
4a02b34e	1554	if (r) {
a685557f MS	1555	if (r == -ENOSPC)
	1556	set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
	1557	else
	1558	metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
991d9fa0	1559	return r;
4a02b34e	1560	}
991d9fa0	1561
3ab91828 JT	1562	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
	1563	if (r) {
	1564	metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
	1565	return r;
	1566	}
	1567
	1568	if (!free_blocks) {
	1569	/* Let's commit before we use up the metadata reserve. */
	1570	r = commit(pool);
	1571	if (r)
	1572	return r;
	1573	}
	1574
991d9fa0 JT	1575	return 0;
	1576	}
	1577
	1578	/*
	1579	* If we have run out of space, queue bios until the device is
	1580	* resumed, presumably after having been reloaded with more space.
	1581	*/
	1582	static void retry_on_resume(struct bio *bio)
	1583	{
59c3d2c6	1584	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d	1585	struct thin_c *tc = h->tc;
991d9fa0	1586
8e0c9dac	1587	spin_lock_irq(&tc->lock);
c140e1c4	1588	bio_list_add(&tc->retry_on_resume_list, bio);
8e0c9dac	1589	spin_unlock_irq(&tc->lock);
991d9fa0 JT	1590	}
991d9fa0 JT	1591
4e4cbee9	1592	static blk_status_t should_error_unserviceable_bio(struct pool *pool)
8c0f0e8c	1593	{
3e1a0699 JT	1594	enum pool_mode m = get_pool_mode(pool);
	1595
	1596	switch (m) {
	1597	case PM_WRITE:
	1598	/* Shouldn't get here */
	1599	DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
4e4cbee9	1600	return BLK_STS_IOERR;
3e1a0699 JT	1601
3e1a0699 JT	1602	case PM_OUT_OF_DATA_SPACE:
4e4cbee9	1603	return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
3e1a0699	1604
3ab91828	1605	case PM_OUT_OF_METADATA_SPACE:
3e1a0699 JT	1606	case PM_READ_ONLY:
3e1a0699 JT	1607	case PM_FAIL:
4e4cbee9	1608	return BLK_STS_IOERR;
3e1a0699 JT	1609	default:
	1610	/* Shouldn't get here */
	1611	DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
4e4cbee9	1612	return BLK_STS_IOERR;
3e1a0699 JT	1613	}
3e1a0699 JT	1614	}
8c0f0e8c	1615
3e1a0699 JT	1616	static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
3e1a0699 JT	1617	{
4e4cbee9	1618	blk_status_t error = should_error_unserviceable_bio(pool);
af91805a	1619
4246a0b6	1620	if (error) {
4e4cbee9	1621	bio->bi_status = error;
4246a0b6 CH	1622	bio_endio(bio);
4246a0b6 CH	1623	} else
6d16202b	1624	retry_on_resume(bio);
8c0f0e8c MS	1625	}
8c0f0e8c MS	1626
399caddf	1627	static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
991d9fa0 JT	1628	{
	1629	struct bio *bio;
	1630	struct bio_list bios;
4e4cbee9	1631	blk_status_t error;
991d9fa0	1632
af91805a MS	1633	error = should_error_unserviceable_bio(pool);
	1634	if (error) {
	1635	cell_error_with_code(pool, cell, error);
3e1a0699 JT	1636	return;
	1637	}
	1638
991d9fa0	1639	bio_list_init(&bios);
6beca5eb	1640	cell_release(pool, cell, &bios);
991d9fa0	1641
9d094eeb MS	1642	while ((bio = bio_list_pop(&bios)))
9d094eeb MS	1643	retry_on_resume(bio);
991d9fa0 JT	1644	}
991d9fa0 JT	1645
34fbcf62 JT	1646	static void process_discard_cell_no_passdown(struct thin_c *tc,
34fbcf62 JT	1647	struct dm_bio_prison_cell *virt_cell)
104655fd	1648	{
104655fd	1649	struct pool *pool = tc->pool;
34fbcf62	1650	struct dm_thin_new_mapping *m = get_next_mapping(pool);
104655fd	1651
34fbcf62 JT	1652	/*
	1653	* We don't need to lock the data blocks, since there's no
	1654	* passdown. We only lock data blocks for allocation and breaking sharing.
	1655	*/
	1656	m->tc = tc;
	1657	m->virt_begin = virt_cell->key.block_begin;
	1658	m->virt_end = virt_cell->key.block_end;
	1659	m->cell = virt_cell;
	1660	m->bio = virt_cell->holder;
104655fd	1661
34fbcf62 JT	1662	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
	1663	pool->process_prepared_discard(m);
	1664	}
104655fd	1665
34fbcf62 JT	1666	static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
	1667	struct bio *bio)
	1668	{
	1669	struct pool *pool = tc->pool;
	1670
	1671	int r;
	1672	bool maybe_shared;
	1673	struct dm_cell_key data_key;
	1674	struct dm_bio_prison_cell *data_cell;
	1675	struct dm_thin_new_mapping *m;
e2dd8aca JT	1676	dm_block_t virt_begin, virt_end, data_begin, data_end;
e2dd8aca JT	1677	dm_block_t len, next_boundary;
34fbcf62 JT	1678
34fbcf62 JT	1679	while (begin != end) {
34fbcf62 JT	1680	r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
34fbcf62 JT	1681	&data_begin, &maybe_shared);
e2dd8aca	1682	if (r) {
104655fd	1683	/*
34fbcf62 JT	1684	* Silently fail, letting any mappings we've
34fbcf62 JT	1685	* created complete.
104655fd	1686	*/
34fbcf62	1687	break;
104655fd	1688	}
104655fd	1689
e2dd8aca	1690	data_end = data_begin + (virt_end - virt_begin);
104655fd	1691
34fbcf62	1692	/*
e2dd8aca	1693	* Make sure the data region obeys the bio prison restrictions.
34fbcf62	1694	*/
e2dd8aca JT	1695	while (data_begin < data_end) {
	1696	r = ensure_next_mapping(pool);
	1697	if (r)
	1698	return; /* we did our best */
	1699
	1700	next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
	1701	<< BIO_PRISON_MAX_RANGE_SHIFT;
	1702	len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
	1703
3f8d3f54 MS	1704	/* This key is certainly within range given the above splitting */
3f8d3f54 MS	1705	(void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
e2dd8aca JT	1706	if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
	1707	/* contention, we'll give up with this range */
	1708	data_begin += len;
	1709	continue;
	1710	}
	1711
	1712	/*
	1713	* IO may still be going to the destination block. We must
	1714	* quiesce before we can do the removal.
	1715	*/
	1716	m = get_next_mapping(pool);
	1717	m->tc = tc;
	1718	m->maybe_shared = maybe_shared;
	1719	m->virt_begin = virt_begin;
	1720	m->virt_end = virt_begin + len;
	1721	m->data_block = data_begin;
	1722	m->cell = data_cell;
	1723	m->bio = bio;
	1724
	1725	/*
	1726	* The parent bio must not complete before sub discard bios are
	1727	* chained to it (see end_discard's bio_chain)!
	1728	*
	1729	* This per-mapping bi_remaining increment is paired with
	1730	* the implicit decrement that occurs via bio_endio() in
	1731	* end_discard().
	1732	*/
	1733	bio_inc_remaining(bio);
	1734	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
	1735	pool->process_prepared_discard(m);
	1736
	1737	virt_begin += len;
	1738	data_begin += len;
	1739	}
34fbcf62 JT	1740
34fbcf62 JT	1741	begin = virt_end;
104655fd JT	1742	}
	1743	}
	1744
34fbcf62 JT	1745	static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
	1746	{
	1747	struct bio *bio = virt_cell->holder;
	1748	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	1749
	1750	/*
	1751	* The virt_cell will only get freed once the origin bio completes.
	1752	* This means it will remain locked while all the individual
	1753	* passdown bios are in flight.
	1754	*/
	1755	h->cell = virt_cell;
	1756	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
	1757
	1758	/*
	1759	* We complete the bio now, knowing that the bi_remaining field
	1760	* will prevent completion until the sub range discards have
	1761	* completed.
	1762	*/
4246a0b6	1763	bio_endio(bio);
34fbcf62 JT	1764	}
34fbcf62 JT	1765
a374bb21 JT	1766	static void process_discard_bio(struct thin_c *tc, struct bio *bio)
a374bb21 JT	1767	{
34fbcf62 JT	1768	dm_block_t begin, end;
	1769	struct dm_cell_key virt_key;
	1770	struct dm_bio_prison_cell *virt_cell;
a374bb21	1771
34fbcf62 JT	1772	get_bio_block_range(tc, bio, &begin, &end);
	1773	if (begin == end) {
	1774	/*
	1775	* The discard covers less than a block.
	1776	*/
4246a0b6	1777	bio_endio(bio);
a374bb21	1778	return;
34fbcf62	1779	}
a374bb21	1780
3f8d3f54 MS	1781	if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
	1782	DMERR_LIMIT("Discard doesn't respect bio prison limits");
	1783	bio_endio(bio);
	1784	return;
	1785	}
	1786
	1787	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) {
34fbcf62 JT	1788	/*
	1789	* Potential starvation issue: We're relying on the
	1790	* fs/application being well behaved, and not trying to
	1791	* send IO to a region at the same time as discarding it.
	1792	* If they do this persistently then it's possible this
	1793	* cell will never be granted.
	1794	*/
	1795	return;
3f8d3f54	1796	}
34fbcf62 JT	1797
34fbcf62 JT	1798	tc->pool->process_discard_cell(tc, virt_cell);
a374bb21 JT	1799	}
a374bb21 JT	1800
991d9fa0	1801	static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
44feb387	1802	struct dm_cell_key *key,
991d9fa0	1803	struct dm_thin_lookup_result *lookup_result,
a24c2569	1804	struct dm_bio_prison_cell *cell)
991d9fa0 JT	1805	{
	1806	int r;
	1807	dm_block_t data_block;
d6fc2042	1808	struct pool *pool = tc->pool;
991d9fa0 JT	1809
	1810	r = alloc_data_block(tc, &data_block);
	1811	switch (r) {
	1812	case 0:
2dd9c257 JT	1813	schedule_internal_copy(tc, block, lookup_result->block,
2dd9c257 JT	1814	data_block, cell, bio);
991d9fa0 JT	1815	break;
	1816
	1817	case -ENOSPC:
399caddf	1818	retry_bios_on_resume(pool, cell);
991d9fa0 JT	1819	break;
	1820
	1821	default:
c397741c MS	1822	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
c397741c MS	1823	__func__, r);
d6fc2042	1824	cell_error(pool, cell);
991d9fa0 JT	1825	break;
	1826	}
	1827	}
	1828
23ca2bb6 JT	1829	static void __remap_and_issue_shared_cell(void *context,
	1830	struct dm_bio_prison_cell *cell)
	1831	{
	1832	struct remap_info *info = context;
	1833	struct bio *bio;
	1834
	1835	while ((bio = bio_list_pop(&cell->bios))) {
f73f44eb CH	1836	if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
f73f44eb CH	1837	bio_op(bio) == REQ_OP_DISCARD)
23ca2bb6 JT	1838	bio_list_add(&info->defer_bios, bio);
23ca2bb6 JT	1839	else {
bd6d1e0a	1840	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
23ca2bb6 JT	1841
	1842	h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
	1843	inc_all_io_entry(info->tc->pool, bio);
	1844	bio_list_add(&info->issue_bios, bio);
	1845	}
	1846	}
	1847	}
	1848
	1849	static void remap_and_issue_shared_cell(struct thin_c *tc,
	1850	struct dm_bio_prison_cell *cell,
	1851	dm_block_t block)
	1852	{
	1853	struct bio *bio;
	1854	struct remap_info info;
	1855
	1856	info.tc = tc;
	1857	bio_list_init(&info.defer_bios);
	1858	bio_list_init(&info.issue_bios);
	1859
	1860	cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
	1861	&info, cell);
	1862
	1863	while ((bio = bio_list_pop(&info.defer_bios)))
	1864	thin_defer_bio(tc, bio);
	1865
	1866	while ((bio = bio_list_pop(&info.issue_bios)))
	1867	remap_and_issue(tc, bio, block);
	1868	}
	1869
991d9fa0 JT	1870	static void process_shared_bio(struct thin_c *tc, struct bio *bio,
991d9fa0 JT	1871	dm_block_t block,
23ca2bb6 JT	1872	struct dm_thin_lookup_result *lookup_result,
23ca2bb6 JT	1873	struct dm_bio_prison_cell *virt_cell)
991d9fa0	1874	{
23ca2bb6	1875	struct dm_bio_prison_cell *data_cell;
991d9fa0	1876	struct pool *pool = tc->pool;
44feb387	1877	struct dm_cell_key key;
991d9fa0 JT	1878
	1879	/*
	1880	* If cell is already occupied, then sharing is already in the process
	1881	* of being broken so we have nothing further to do here.
	1882	*/
	1883	build_data_key(tc->td, lookup_result->block, &key);
23ca2bb6 JT	1884	if (bio_detain(pool, &key, bio, &data_cell)) {
23ca2bb6 JT	1885	cell_defer_no_holder(tc, virt_cell);
991d9fa0	1886	return;
23ca2bb6	1887	}
991d9fa0	1888
23ca2bb6 JT	1889	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
	1890	break_sharing(tc, bio, block, &key, lookup_result, data_cell);
	1891	cell_defer_no_holder(tc, virt_cell);
	1892	} else {
59c3d2c6	1893	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
991d9fa0	1894
44feb387	1895	h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
e8088073	1896	inc_all_io_entry(pool, bio);
991d9fa0	1897	remap_and_issue(tc, bio, lookup_result->block);
23ca2bb6 JT	1898
	1899	remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
	1900	remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
991d9fa0 JT	1901	}
	1902	}
	1903
	1904	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
a24c2569	1905	struct dm_bio_prison_cell *cell)
991d9fa0 JT	1906	{
	1907	int r;
	1908	dm_block_t data_block;
6beca5eb	1909	struct pool *pool = tc->pool;
991d9fa0 JT	1910
	1911	/*
	1912	* Remap empty bios (flushes) immediately, without provisioning.
	1913	*/
4f024f37	1914	if (!bio->bi_iter.bi_size) {
6beca5eb	1915	inc_all_io_entry(pool, bio);
f286ba0e	1916	cell_defer_no_holder(tc, cell);
e8088073	1917
991d9fa0 JT	1918	remap_and_issue(tc, bio, 0);
	1919	return;
	1920	}
	1921
	1922	/*
	1923	* Fill read bios with zeroes and complete them immediately.
	1924	*/
	1925	if (bio_data_dir(bio) == READ) {
	1926	zero_fill_bio(bio);
f286ba0e	1927	cell_defer_no_holder(tc, cell);
4246a0b6	1928	bio_endio(bio);
991d9fa0 JT	1929	return;
	1930	}
	1931
	1932	r = alloc_data_block(tc, &data_block);
	1933	switch (r) {
	1934	case 0:
2dd9c257 JT	1935	if (tc->origin_dev)
	1936	schedule_external_copy(tc, block, data_block, cell, bio);
	1937	else
	1938	schedule_zero(tc, block, data_block, cell, bio);
991d9fa0 JT	1939	break;
	1940
	1941	case -ENOSPC:
399caddf	1942	retry_bios_on_resume(pool, cell);
991d9fa0 JT	1943	break;
	1944
	1945	default:
c397741c MS	1946	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
c397741c MS	1947	__func__, r);
6beca5eb	1948	cell_error(pool, cell);
991d9fa0 JT	1949	break;
	1950	}
	1951	}
	1952
a374bb21	1953	static void process_cell(struct thin_c tc, struct dm_bio_prison_cell cell)
991d9fa0 JT	1954	{
991d9fa0 JT	1955	int r;
6beca5eb	1956	struct pool *pool = tc->pool;
a374bb21	1957	struct bio *bio = cell->holder;
991d9fa0	1958	dm_block_t block = get_bio_block(tc, bio);
991d9fa0 JT	1959	struct dm_thin_lookup_result lookup_result;
991d9fa0 JT	1960
a374bb21 JT	1961	if (tc->requeue_mode) {
a374bb21 JT	1962	cell_requeue(pool, cell);
991d9fa0	1963	return;
a374bb21	1964	}
991d9fa0 JT	1965
	1966	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	1967	switch (r) {
	1968	case 0:
23ca2bb6 JT	1969	if (lookup_result.shared)
	1970	process_shared_bio(tc, bio, block, &lookup_result, cell);
	1971	else {
6beca5eb	1972	inc_all_io_entry(pool, bio);
991d9fa0	1973	remap_and_issue(tc, bio, lookup_result.block);
a374bb21	1974	inc_remap_and_issue_cell(tc, cell, lookup_result.block);
e8088073	1975	}
991d9fa0 JT	1976	break;
	1977
	1978	case -ENODATA:
2dd9c257	1979	if (bio_data_dir(bio) == READ && tc->origin_dev) {
6beca5eb	1980	inc_all_io_entry(pool, bio);
f286ba0e	1981	cell_defer_no_holder(tc, cell);
e8088073	1982
e5aea7b4 JT	1983	if (bio_end_sector(bio) <= tc->origin_size)
	1984	remap_to_origin_and_issue(tc, bio);
	1985
	1986	else if (bio->bi_iter.bi_sector < tc->origin_size) {
	1987	zero_fill_bio(bio);
	1988	bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
	1989	remap_to_origin_and_issue(tc, bio);
	1990
	1991	} else {
	1992	zero_fill_bio(bio);
4246a0b6	1993	bio_endio(bio);
e5aea7b4	1994	}
2dd9c257 JT	1995	} else
2dd9c257 JT	1996	provision_block(tc, bio, block, cell);
991d9fa0 JT	1997	break;
	1998
	1999	default:
c397741c MS	2000	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
c397741c MS	2001	__func__, r);
f286ba0e	2002	cell_defer_no_holder(tc, cell);
991d9fa0 JT	2003	bio_io_error(bio);
	2004	break;
	2005	}
	2006	}
	2007
a374bb21 JT	2008	static void process_bio(struct thin_c tc, struct bio bio)
	2009	{
	2010	struct pool *pool = tc->pool;
	2011	dm_block_t block = get_bio_block(tc, bio);
	2012	struct dm_bio_prison_cell *cell;
	2013	struct dm_cell_key key;
	2014
	2015	/*
	2016	* If cell is already occupied, then the block is already
	2017	* being provisioned so we have nothing further to do here.
	2018	*/
	2019	build_virtual_key(tc->td, block, &key);
	2020	if (bio_detain(pool, &key, bio, &cell))
	2021	return;
	2022
	2023	process_cell(tc, cell);
	2024	}
	2025
	2026	static void __process_bio_read_only(struct thin_c tc, struct bio bio,
	2027	struct dm_bio_prison_cell *cell)
e49e5829 JT	2028	{
	2029	int r;
	2030	int rw = bio_data_dir(bio);
	2031	dm_block_t block = get_bio_block(tc, bio);
	2032	struct dm_thin_lookup_result lookup_result;
	2033
	2034	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	2035	switch (r) {
	2036	case 0:
a374bb21	2037	if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
8c0f0e8c	2038	handle_unserviceable_bio(tc->pool, bio);
a374bb21 JT	2039	if (cell)
	2040	cell_defer_no_holder(tc, cell);
	2041	} else {
e8088073	2042	inc_all_io_entry(tc->pool, bio);
e49e5829	2043	remap_and_issue(tc, bio, lookup_result.block);
a374bb21 JT	2044	if (cell)
a374bb21 JT	2045	inc_remap_and_issue_cell(tc, cell, lookup_result.block);
e8088073	2046	}
e49e5829 JT	2047	break;
	2048
	2049	case -ENODATA:
a374bb21 JT	2050	if (cell)
a374bb21 JT	2051	cell_defer_no_holder(tc, cell);
e49e5829	2052	if (rw != READ) {
8c0f0e8c	2053	handle_unserviceable_bio(tc->pool, bio);
e49e5829 JT	2054	break;
	2055	}
	2056
	2057	if (tc->origin_dev) {
e8088073	2058	inc_all_io_entry(tc->pool, bio);
e49e5829 JT	2059	remap_to_origin_and_issue(tc, bio);
	2060	break;
	2061	}
	2062
	2063	zero_fill_bio(bio);
4246a0b6	2064	bio_endio(bio);
e49e5829 JT	2065	break;
	2066
	2067	default:
c397741c MS	2068	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
c397741c MS	2069	__func__, r);
a374bb21 JT	2070	if (cell)
a374bb21 JT	2071	cell_defer_no_holder(tc, cell);
e49e5829 JT	2072	bio_io_error(bio);
	2073	break;
	2074	}
	2075	}
	2076
a374bb21 JT	2077	static void process_bio_read_only(struct thin_c tc, struct bio bio)
	2078	{
	2079	__process_bio_read_only(tc, bio, NULL);
	2080	}
	2081
	2082	static void process_cell_read_only(struct thin_c tc, struct dm_bio_prison_cell cell)
	2083	{
	2084	__process_bio_read_only(tc, cell->holder, cell);
	2085	}
	2086
/* Bio handler that simply completes @bio via bio_endio(); @tc is unused. */
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
	bio_endio(bio);
}
3e1a0699 JT	2091
/* Bio handler that fails @bio via bio_io_error(); @tc is unused. */
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}
	2096
a374bb21 JT	2097	static void process_cell_success(struct thin_c tc, struct dm_bio_prison_cell cell)
	2098	{
	2099	cell_success(tc->pool, cell);
	2100	}
	2101
	2102	static void process_cell_fail(struct thin_c tc, struct dm_bio_prison_cell cell)
	2103	{
	2104	cell_error(tc->pool, cell);
	2105	}
	2106
ac8c3f3d JT	2107	/*
	2108	* FIXME: should we also commit due to size of transaction, measured in
	2109	* metadata blocks?
	2110	*/
905e51b3 JT	2111	static int need_commit_due_to_time(struct pool *pool)
905e51b3 JT	2112	{
0f30af98 MS	2113	return !time_in_range(jiffies, pool->last_commit_jiffies,
0f30af98 MS	2114	pool->last_commit_jiffies + COMMIT_PERIOD);
905e51b3 JT	2115	}
905e51b3 JT	2116
/* From an rb_node embedded in a dm_thin_endio_hook to the hook itself. */
#define thin_pbd(rbn) rb_entry((rbn), struct dm_thin_endio_hook, rb_node)
/* From a per-bio dm_thin_endio_hook back to the bio it belongs to. */
#define thin_bio(hook) dm_bio_from_per_bio_data((hook), sizeof(struct dm_thin_endio_hook))
	2119
	2120	static void __thin_bio_rb_add(struct thin_c tc, struct bio bio)
	2121	{
	2122	struct rb_node *rbp, parent;
	2123	struct dm_thin_endio_hook *pbd;
	2124	sector_t bi_sector = bio->bi_iter.bi_sector;
	2125
	2126	rbp = &tc->sort_bio_list.rb_node;
	2127	parent = NULL;
	2128	while (*rbp) {
	2129	parent = *rbp;
	2130	pbd = thin_pbd(parent);
	2131
	2132	if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
	2133	rbp = &(*rbp)->rb_left;
	2134	else
	2135	rbp = &(*rbp)->rb_right;
	2136	}
	2137
	2138	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	2139	rb_link_node(&pbd->rb_node, parent, rbp);
	2140	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
	2141	}
	2142
	2143	static void __extract_sorted_bios(struct thin_c *tc)
	2144	{
	2145	struct rb_node *node;
	2146	struct dm_thin_endio_hook *pbd;
	2147	struct bio *bio;
	2148
	2149	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
	2150	pbd = thin_pbd(node);
	2151	bio = thin_bio(pbd);
	2152
	2153	bio_list_add(&tc->deferred_bio_list, bio);
	2154	rb_erase(&pbd->rb_node, &tc->sort_bio_list);
	2155	}
	2156
	2157	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
	2158	}
	2159
	2160	static void __sort_thin_deferred_bios(struct thin_c *tc)
	2161	{
	2162	struct bio *bio;
	2163	struct bio_list bios;
	2164
	2165	bio_list_init(&bios);
	2166	bio_list_merge(&bios, &tc->deferred_bio_list);
	2167	bio_list_init(&tc->deferred_bio_list);
	2168
	2169	/* Sort deferred_bio_list using rb-tree */
	2170	while ((bio = bio_list_pop(&bios)))
	2171	__thin_bio_rb_add(tc, bio);
	2172
	2173	/*
	2174	* Transfer the sorted bios in sort_bio_list back to
	2175	* deferred_bio_list to allow lockless submission of
	2176	* all bios.
	2177	*/
	2178	__extract_sorted_bios(tc);
	2179	}
	2180
c140e1c4	2181	static void process_thin_deferred_bios(struct thin_c *tc)
991d9fa0	2182	{
c140e1c4	2183	struct pool *pool = tc->pool;
991d9fa0 JT	2184	struct bio *bio;
991d9fa0 JT	2185	struct bio_list bios;
67324ea1	2186	struct blk_plug plug;
86a3238c	2187	unsigned int count = 0;
991d9fa0	2188
c140e1c4	2189	if (tc->requeue_mode) {
4e4cbee9 CH	2190	error_thin_bio_list(tc, &tc->deferred_bio_list,
4e4cbee9 CH	2191	BLK_STS_DM_REQUEUE);
c140e1c4 MS	2192	return;
	2193	}
	2194
991d9fa0 JT	2195	bio_list_init(&bios);
991d9fa0 JT	2196
8e0c9dac	2197	spin_lock_irq(&tc->lock);
67324ea1 MS	2198
67324ea1 MS	2199	if (bio_list_empty(&tc->deferred_bio_list)) {
8e0c9dac	2200	spin_unlock_irq(&tc->lock);
67324ea1 MS	2201	return;
	2202	}
	2203
	2204	__sort_thin_deferred_bios(tc);
	2205
c140e1c4 MS	2206	bio_list_merge(&bios, &tc->deferred_bio_list);
c140e1c4 MS	2207	bio_list_init(&tc->deferred_bio_list);
67324ea1	2208
8e0c9dac	2209	spin_unlock_irq(&tc->lock);
991d9fa0	2210
67324ea1	2211	blk_start_plug(&plug);
991d9fa0	2212	while ((bio = bio_list_pop(&bios))) {
991d9fa0 JT	2213	/*
	2214	* If we've got no free new_mapping structs, and processing
	2215	* this bio might require one, we pause until there are some
	2216	* prepared mappings to process.
	2217	*/
	2218	if (ensure_next_mapping(pool)) {
8e0c9dac	2219	spin_lock_irq(&tc->lock);
c140e1c4 MS	2220	bio_list_add(&tc->deferred_bio_list, bio);
c140e1c4 MS	2221	bio_list_merge(&tc->deferred_bio_list, &bios);
8e0c9dac	2222	spin_unlock_irq(&tc->lock);
991d9fa0 JT	2223	break;
991d9fa0 JT	2224	}
104655fd	2225
e6047149	2226	if (bio_op(bio) == REQ_OP_DISCARD)
e49e5829	2227	pool->process_discard(tc, bio);
104655fd	2228	else
e49e5829	2229	pool->process_bio(tc, bio);
8a01a6af JT	2230
8a01a6af JT	2231	if ((count++ & 127) == 0) {
7d327fe0	2232	throttle_work_update(&pool->throttle);
8a01a6af JT	2233	dm_pool_issue_prefetches(pool->pmd);
8a01a6af JT	2234	}
e4f80303	2235	cond_resched();
991d9fa0	2236	}
67324ea1	2237	blk_finish_plug(&plug);
c140e1c4 MS	2238	}
c140e1c4 MS	2239
ac4c3f34 JT	2240	static int cmp_cells(const void lhs, const void rhs)
	2241	{
	2242	struct dm_bio_prison_cell lhs_cell = ((struct dm_bio_prison_cell **) lhs);
	2243	struct dm_bio_prison_cell rhs_cell = ((struct dm_bio_prison_cell **) rhs);
	2244
	2245	BUG_ON(!lhs_cell->holder);
	2246	BUG_ON(!rhs_cell->holder);
	2247
	2248	if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
	2249	return -1;
	2250
	2251	if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
	2252	return 1;
	2253
	2254	return 0;
	2255	}
	2256
86a3238c	2257	static unsigned int sort_cells(struct pool pool, struct list_head cells)
ac4c3f34	2258	{
86a3238c	2259	unsigned int count = 0;
ac4c3f34 JT	2260	struct dm_bio_prison_cell cell, tmp;
	2261
	2262	list_for_each_entry_safe(cell, tmp, cells, user_list) {
	2263	if (count >= CELL_SORT_ARRAY_SIZE)
	2264	break;
	2265
	2266	pool->cell_sort_array[count++] = cell;
	2267	list_del(&cell->user_list);
	2268	}
	2269
	2270	sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
	2271
	2272	return count;
	2273	}
	2274
a374bb21 JT	2275	static void process_thin_deferred_cells(struct thin_c *tc)
	2276	{
	2277	struct pool *pool = tc->pool;
a374bb21	2278	struct list_head cells;
ac4c3f34	2279	struct dm_bio_prison_cell *cell;
86a3238c	2280	unsigned int i, j, count;
a374bb21 JT	2281
	2282	INIT_LIST_HEAD(&cells);
	2283
8e0c9dac	2284	spin_lock_irq(&tc->lock);
a374bb21	2285	list_splice_init(&tc->deferred_cells, &cells);
8e0c9dac	2286	spin_unlock_irq(&tc->lock);
a374bb21 JT	2287
	2288	if (list_empty(&cells))
	2289	return;
	2290
ac4c3f34 JT	2291	do {
ac4c3f34 JT	2292	count = sort_cells(tc->pool, &cells);
a374bb21	2293
ac4c3f34 JT	2294	for (i = 0; i < count; i++) {
	2295	cell = pool->cell_sort_array[i];
	2296	BUG_ON(!cell->holder);
a374bb21	2297
ac4c3f34 JT	2298	/*
	2299	* If we've got no free new_mapping structs, and processing
	2300	* this bio might require one, we pause until there are some
	2301	* prepared mappings to process.
	2302	*/
	2303	if (ensure_next_mapping(pool)) {
	2304	for (j = i; j < count; j++)
	2305	list_add(&pool->cell_sort_array[j]->user_list, &cells);
	2306
8e0c9dac	2307	spin_lock_irq(&tc->lock);
ac4c3f34	2308	list_splice(&cells, &tc->deferred_cells);
8e0c9dac	2309	spin_unlock_irq(&tc->lock);
ac4c3f34 JT	2310	return;
	2311	}
	2312
e6047149	2313	if (bio_op(cell->holder) == REQ_OP_DISCARD)
ac4c3f34 JT	2314	pool->process_discard_cell(tc, cell);
	2315	else
	2316	pool->process_cell(tc, cell);
	2317	}
e4f80303	2318	cond_resched();
ac4c3f34	2319	} while (!list_empty(&cells));
a374bb21 JT	2320	}
a374bb21 JT	2321
b10ebd34 JT	2322	static void thin_get(struct thin_c *tc);
	2323	static void thin_put(struct thin_c *tc);
	2324
	2325	/*
	2326	* We can't hold rcu_read_lock() around code that can block. So we
	2327	* find a thin with the rcu lock held; bump a refcount; then drop
	2328	* the lock.
	2329	*/
	2330	static struct thin_c get_first_thin(struct pool pool)
	2331	{
	2332	struct thin_c *tc = NULL;
	2333
	2334	rcu_read_lock();
80f130bf KJ	2335	tc = list_first_or_null_rcu(&pool->active_thins, struct thin_c, list);
80f130bf KJ	2336	if (tc)
b10ebd34	2337	thin_get(tc);
b10ebd34 JT	2338	rcu_read_unlock();
	2339
	2340	return tc;
	2341	}
	2342
	2343	static struct thin_c get_next_thin(struct pool pool, struct thin_c *tc)
	2344	{
	2345	struct thin_c *old_tc = tc;
	2346
	2347	rcu_read_lock();
	2348	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
	2349	thin_get(tc);
	2350	thin_put(old_tc);
	2351	rcu_read_unlock();
	2352	return tc;
	2353	}
	2354	thin_put(old_tc);
	2355	rcu_read_unlock();
	2356
	2357	return NULL;
	2358	}
	2359
c140e1c4 MS	2360	static void process_deferred_bios(struct pool *pool)
c140e1c4 MS	2361	{
c140e1c4	2362	struct bio *bio;
4ae280b4	2363	struct bio_list bios, bio_completions;
c140e1c4 MS	2364	struct thin_c *tc;
c140e1c4 MS	2365
b10ebd34 JT	2366	tc = get_first_thin(pool);
b10ebd34 JT	2367	while (tc) {
a374bb21	2368	process_thin_deferred_cells(tc);
c140e1c4	2369	process_thin_deferred_bios(tc);
b10ebd34 JT	2370	tc = get_next_thin(pool, tc);
b10ebd34 JT	2371	}
991d9fa0 JT	2372
991d9fa0 JT	2373	/*
4ae280b4 NT	2374	* If there are any deferred flush bios, we must commit the metadata
4ae280b4 NT	2375	* before issuing them or signaling their completion.
991d9fa0 JT	2376	*/
991d9fa0 JT	2377	bio_list_init(&bios);
4ae280b4 NT	2378	bio_list_init(&bio_completions);
4ae280b4 NT	2379
8e0c9dac	2380	spin_lock_irq(&pool->lock);
991d9fa0 JT	2381	bio_list_merge(&bios, &pool->deferred_flush_bios);
991d9fa0 JT	2382	bio_list_init(&pool->deferred_flush_bios);
4ae280b4 NT	2383
	2384	bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
	2385	bio_list_init(&pool->deferred_flush_completions);
8e0c9dac	2386	spin_unlock_irq(&pool->lock);
991d9fa0	2387
4ae280b4	2388	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
4d1662a3	2389	!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
991d9fa0 JT	2390	return;
991d9fa0 JT	2391
020cc3b5	2392	if (commit(pool)) {
4ae280b4 NT	2393	bio_list_merge(&bios, &bio_completions);
4ae280b4 NT	2394
991d9fa0 JT	2395	while ((bio = bio_list_pop(&bios)))
	2396	bio_io_error(bio);
	2397	return;
	2398	}
905e51b3	2399	pool->last_commit_jiffies = jiffies;
991d9fa0	2400
4ae280b4 NT	2401	while ((bio = bio_list_pop(&bio_completions)))
	2402	bio_endio(bio);
	2403
694cfe7f NT	2404	while ((bio = bio_list_pop(&bios))) {
	2405	/*
	2406	* The data device was flushed as part of metadata commit,
	2407	* so complete redundant flushes immediately.
	2408	*/
	2409	if (bio->bi_opf & REQ_PREFLUSH)
	2410	bio_endio(bio);
	2411	else
b7f8dff0	2412	dm_submit_bio_remap(bio, NULL);
694cfe7f	2413	}
991d9fa0 JT	2414	}
	2415
	2416	static void do_worker(struct work_struct *ws)
	2417	{
	2418	struct pool *pool = container_of(ws, struct pool, worker);
	2419
7d327fe0	2420	throttle_work_start(&pool->throttle);
8a01a6af	2421	dm_pool_issue_prefetches(pool->pmd);
7d327fe0	2422	throttle_work_update(&pool->throttle);
e49e5829	2423	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
7d327fe0	2424	throttle_work_update(&pool->throttle);
e49e5829	2425	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
7d327fe0	2426	throttle_work_update(&pool->throttle);
2a0fbffb JT	2427	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2a0fbffb JT	2428	throttle_work_update(&pool->throttle);
991d9fa0	2429	process_deferred_bios(pool);
7d327fe0	2430	throttle_work_complete(&pool->throttle);
991d9fa0 JT	2431	}
991d9fa0 JT	2432
905e51b3 JT	2433	/*
	2434	* We want to commit periodically so that not too much
	2435	* unwritten data builds up.
	2436	*/
	2437	static void do_waker(struct work_struct *ws)
	2438	{
	2439	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
0ef0b471	2440
905e51b3 JT	2441	wake_worker(pool);
	2442	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
	2443	}
	2444
85ad643b JT	2445	/*
	2446	* We're holding onto IO to allow userland time to react. After the
	2447	* timeout either the pool will have been resized (and thus back in
bcc696fa	2448	* PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
85ad643b JT	2449	*/
	2450	static void do_no_space_timeout(struct work_struct *ws)
	2451	{
	2452	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
	2453	no_space_timeout);
	2454
bcc696fa MS	2455	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
bcc696fa MS	2456	pool->pf.error_if_no_space = true;
f6c36758	2457	notify_of_pool_mode_change(pool);
4e4cbee9	2458	error_retry_list_with_code(pool, BLK_STS_NOSPC);
bcc696fa	2459	}
85ad643b JT	2460	}
85ad643b JT	2461
991d9fa0 JT	2462	/*----------------------------------------------------------------*/
991d9fa0 JT	2463
e7a3e871	2464	struct pool_work {
738211f7	2465	struct work_struct worker;
e7a3e871 JT	2466	struct completion complete;
	2467	};
	2468
	2469	static struct pool_work to_pool_work(struct work_struct ws)
	2470	{
	2471	return container_of(ws, struct pool_work, worker);
	2472	}
	2473
	2474	static void pool_work_complete(struct pool_work *pw)
	2475	{
	2476	complete(&pw->complete);
	2477	}
738211f7	2478
e7a3e871 JT	2479	static void pool_work_wait(struct pool_work pw, struct pool pool,
	2480	void (fn)(struct work_struct ))
	2481	{
	2482	INIT_WORK_ONSTACK(&pw->worker, fn);
	2483	init_completion(&pw->complete);
	2484	queue_work(pool->wq, &pw->worker);
	2485	wait_for_completion(&pw->complete);
e74fa244	2486	destroy_work_on_stack(&pw->worker);
e7a3e871 JT	2487	}
	2488
	2489	/*----------------------------------------------------------------*/
	2490
	2491	struct noflush_work {
	2492	struct pool_work pw;
	2493	struct thin_c *tc;
738211f7 JT	2494	};
738211f7 JT	2495
e7a3e871	2496	static struct noflush_work to_noflush(struct work_struct ws)
738211f7	2497	{
e7a3e871	2498	return container_of(to_pool_work(ws), struct noflush_work, pw);
738211f7 JT	2499	}
	2500
	2501	static void do_noflush_start(struct work_struct *ws)
	2502	{
e7a3e871	2503	struct noflush_work *w = to_noflush(ws);
0ef0b471	2504
738211f7 JT	2505	w->tc->requeue_mode = true;
738211f7 JT	2506	requeue_io(w->tc);
e7a3e871	2507	pool_work_complete(&w->pw);
738211f7 JT	2508	}
	2509
	2510	static void do_noflush_stop(struct work_struct *ws)
	2511	{
e7a3e871	2512	struct noflush_work *w = to_noflush(ws);
0ef0b471	2513
738211f7	2514	w->tc->requeue_mode = false;
e7a3e871	2515	pool_work_complete(&w->pw);
738211f7 JT	2516	}
	2517
	2518	static void noflush_work(struct thin_c tc, void (fn)(struct work_struct *))
	2519	{
	2520	struct noflush_work w;
	2521
738211f7	2522	w.tc = tc;
e7a3e871	2523	pool_work_wait(&w.pw, tc->pool, fn);
738211f7 JT	2524	}
	2525
	2526	/*----------------------------------------------------------------*/
	2527
34fbcf62 JT	2528	static void set_discard_callbacks(struct pool *pool)
	2529	{
	2530	struct pool_c *pt = pool->ti->private;
	2531
fa375646	2532	if (pt->adjusted_pf.discard_passdown) {
34fbcf62	2533	pool->process_discard_cell = process_discard_cell_passdown;
2a0fbffb JT	2534	pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2a0fbffb JT	2535	pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
34fbcf62 JT	2536	} else {
	2537	pool->process_discard_cell = process_discard_cell_no_passdown;
	2538	pool->process_prepared_discard = process_prepared_discard_no_passdown;
	2539	}
	2540	}
	2541
8b64e881	2542	static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
e49e5829	2543	{
cdc2b415	2544	struct pool_c *pt = pool->ti->private;
07f2b6e0 MS	2545	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
07f2b6e0 MS	2546	enum pool_mode old_mode = get_pool_mode(pool);
6aa7de05	2547	unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
07f2b6e0 MS	2548
	2549	/*
	2550	* Never allow the pool to transition to PM_WRITE mode if user
	2551	* intervention is required to verify metadata and data consistency.
	2552	*/
	2553	if (new_mode == PM_WRITE && needs_check) {
	2554	DMERR("%s: unable to switch pool to write mode until repaired.",
	2555	dm_device_name(pool->pool_md));
	2556	if (old_mode != new_mode)
	2557	new_mode = old_mode;
	2558	else
	2559	new_mode = PM_READ_ONLY;
	2560	}
	2561	/*
	2562	* If we were in PM_FAIL mode, rollback of metadata failed. We're
	2563	* not going to recover without a thin_repair. So we never let the
	2564	* pool move out of the old mode.
	2565	*/
	2566	if (old_mode == PM_FAIL)
	2567	new_mode = old_mode;
e49e5829	2568
8b64e881	2569	switch (new_mode) {
e49e5829	2570	case PM_FAIL:
5383ef3a	2571	dm_pool_metadata_read_only(pool->pmd);
e49e5829 JT	2572	pool->process_bio = process_bio_fail;
e49e5829 JT	2573	pool->process_discard = process_bio_fail;
a374bb21 JT	2574	pool->process_cell = process_cell_fail;
a374bb21 JT	2575	pool->process_discard_cell = process_cell_fail;
e49e5829 JT	2576	pool->process_prepared_mapping = process_prepared_mapping_fail;
e49e5829 JT	2577	pool->process_prepared_discard = process_prepared_discard_fail;
3e1a0699 JT	2578
3e1a0699 JT	2579	error_retry_list(pool);
e49e5829 JT	2580	break;
e49e5829 JT	2581
3ab91828	2582	case PM_OUT_OF_METADATA_SPACE:
e49e5829	2583	case PM_READ_ONLY:
3e1a0699 JT	2584	dm_pool_metadata_read_only(pool->pmd);
	2585	pool->process_bio = process_bio_read_only;
	2586	pool->process_discard = process_bio_success;
a374bb21 JT	2587	pool->process_cell = process_cell_read_only;
a374bb21 JT	2588	pool->process_discard_cell = process_cell_success;
3e1a0699	2589	pool->process_prepared_mapping = process_prepared_mapping_fail;
34fbcf62	2590	pool->process_prepared_discard = process_prepared_discard_success;
3e1a0699 JT	2591
	2592	error_retry_list(pool);
	2593	break;
	2594
	2595	case PM_OUT_OF_DATA_SPACE:
	2596	/*
	2597	* Ideally we'd never hit this state; the low water mark
	2598	* would trigger userland to extend the pool before we
	2599	* completely run out of data space. However, many small
	2600	* IOs to unprovisioned space can consume data space at an
	2601	* alarming rate. Adjust your low water mark if you're
	2602	* frequently seeing this mode.
	2603	*/
c3667cc6	2604	pool->out_of_data_space = true;
3e1a0699	2605	pool->process_bio = process_bio_read_only;
a374bb21 JT	2606	pool->process_discard = process_discard_bio;
a374bb21 JT	2607	pool->process_cell = process_cell_read_only;
3e1a0699	2608	pool->process_prepared_mapping = process_prepared_mapping;
34fbcf62	2609	set_discard_callbacks(pool);
85ad643b	2610
80c57893 MS	2611	if (!pool->pf.error_if_no_space && no_space_timeout)
80c57893 MS	2612	queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
e49e5829 JT	2613	break;
	2614
	2615	case PM_WRITE:
75294442 HT	2616	if (old_mode == PM_OUT_OF_DATA_SPACE)
75294442 HT	2617	cancel_delayed_work_sync(&pool->no_space_timeout);
c3667cc6	2618	pool->out_of_data_space = false;
172c2386	2619	pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
9b7aaa64	2620	dm_pool_metadata_read_write(pool->pmd);
e49e5829	2621	pool->process_bio = process_bio;
a374bb21 JT	2622	pool->process_discard = process_discard_bio;
a374bb21 JT	2623	pool->process_cell = process_cell;
e49e5829	2624	pool->process_prepared_mapping = process_prepared_mapping;
34fbcf62	2625	set_discard_callbacks(pool);
e49e5829 JT	2626	break;
e49e5829 JT	2627	}
8b64e881 MS	2628
8b64e881 MS	2629	pool->pf.mode = new_mode;
cdc2b415 MS	2630	/*
	2631	* The pool mode may have changed, sync it so bind_control_target()
	2632	* doesn't cause an unexpected mode transition on resume.
	2633	*/
	2634	pt->adjusted_pf.mode = new_mode;
f6c36758 MS	2635
	2636	if (old_mode != new_mode)
	2637	notify_of_pool_mode_change(pool);
e49e5829 JT	2638	}
e49e5829 JT	2639
07f2b6e0	2640	static void abort_transaction(struct pool *pool)
b5330655	2641	{
07f2b6e0 MS	2642	const char *dev_name = dm_device_name(pool->pool_md);
	2643
	2644	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
	2645	if (dm_pool_abort_metadata(pool->pmd)) {
	2646	DMERR("%s: failed to abort metadata transaction", dev_name);
	2647	set_pool_mode(pool, PM_FAIL);
	2648	}
	2649
	2650	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
	2651	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
	2652	set_pool_mode(pool, PM_FAIL);
	2653	}
	2654	}
399caddf	2655
07f2b6e0 MS	2656	static void metadata_operation_failed(struct pool pool, const char op, int r)
07f2b6e0 MS	2657	{
b5330655 JT	2658	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
	2659	dm_device_name(pool->pool_md), op, r);
	2660
07f2b6e0	2661	abort_transaction(pool);
b5330655 JT	2662	set_pool_mode(pool, PM_READ_ONLY);
	2663	}
	2664
e49e5829 JT	2665	/*----------------------------------------------------------------*/
e49e5829 JT	2666
991d9fa0 JT	2667	/*
	2668	* Mapping functions.
	2669	*/
	2670
	2671	/*
	2672	* Called only while mapping a thin bio to hand it over to the workqueue.
	2673	*/
	2674	static void thin_defer_bio(struct thin_c tc, struct bio bio)
	2675	{
991d9fa0 JT	2676	struct pool *pool = tc->pool;
991d9fa0 JT	2677
8e0c9dac	2678	spin_lock_irq(&tc->lock);
c140e1c4	2679	bio_list_add(&tc->deferred_bio_list, bio);
8e0c9dac	2680	spin_unlock_irq(&tc->lock);
991d9fa0 JT	2681
	2682	wake_worker(pool);
	2683	}
	2684
7d327fe0 JT	2685	static void thin_defer_bio_with_throttle(struct thin_c tc, struct bio bio)
	2686	{
	2687	struct pool *pool = tc->pool;
	2688
	2689	throttle_lock(&pool->throttle);
	2690	thin_defer_bio(tc, bio);
	2691	throttle_unlock(&pool->throttle);
	2692	}
	2693
a374bb21 JT	2694	static void thin_defer_cell(struct thin_c tc, struct dm_bio_prison_cell cell)
a374bb21 JT	2695	{
a374bb21 JT	2696	struct pool *pool = tc->pool;
	2697
	2698	throttle_lock(&pool->throttle);
8e0c9dac	2699	spin_lock_irq(&tc->lock);
a374bb21	2700	list_add_tail(&cell->user_list, &tc->deferred_cells);
8e0c9dac	2701	spin_unlock_irq(&tc->lock);
a374bb21 JT	2702	throttle_unlock(&pool->throttle);
	2703
	2704	wake_worker(pool);
	2705	}
	2706
59c3d2c6	2707	static void thin_hook_bio(struct thin_c tc, struct bio bio)
eb2aa48d	2708	{
59c3d2c6	2709	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d JT	2710
	2711	h->tc = tc;
	2712	h->shared_read_entry = NULL;
e8088073	2713	h->all_io_entry = NULL;
eb2aa48d	2714	h->overwrite_mapping = NULL;
34fbcf62	2715	h->cell = NULL;
eb2aa48d JT	2716	}
eb2aa48d JT	2717
991d9fa0 JT	2718	/*
	2719	* Non-blocking function called from the thin target's map function.
	2720	*/
7de3ee57	2721	static int thin_bio_map(struct dm_target ti, struct bio bio)
991d9fa0 JT	2722	{
	2723	int r;
	2724	struct thin_c *tc = ti->private;
	2725	dm_block_t block = get_bio_block(tc, bio);
	2726	struct dm_thin_device *td = tc->td;
	2727	struct dm_thin_lookup_result result;
a374bb21	2728	struct dm_bio_prison_cell virt_cell, data_cell;
e8088073	2729	struct dm_cell_key key;
991d9fa0	2730
59c3d2c6	2731	thin_hook_bio(tc, bio);
e49e5829	2732
738211f7	2733	if (tc->requeue_mode) {
4e4cbee9	2734	bio->bi_status = BLK_STS_DM_REQUEUE;
4246a0b6	2735	bio_endio(bio);
738211f7 JT	2736	return DM_MAPIO_SUBMITTED;
	2737	}
	2738
e49e5829 JT	2739	if (get_pool_mode(tc->pool) == PM_FAIL) {
	2740	bio_io_error(bio);
	2741	return DM_MAPIO_SUBMITTED;
	2742	}
	2743
f73f44eb	2744	if (op_is_flush(bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD) {
7d327fe0	2745	thin_defer_bio_with_throttle(tc, bio);
991d9fa0 JT	2746	return DM_MAPIO_SUBMITTED;
	2747	}
	2748
c822ed96 JT	2749	/*
	2750	* We must hold the virtual cell before doing the lookup, otherwise
	2751	* there's a race with discard.
	2752	*/
	2753	build_virtual_key(tc->td, block, &key);
a374bb21	2754	if (bio_detain(tc->pool, &key, bio, &virt_cell))
c822ed96 JT	2755	return DM_MAPIO_SUBMITTED;
c822ed96 JT	2756
991d9fa0 JT	2757	r = dm_thin_find_block(td, block, 0, &result);
	2758
	2759	/*
	2760	* Note that we defer readahead too.
	2761	*/
	2762	switch (r) {
	2763	case 0:
	2764	if (unlikely(result.shared)) {
	2765	/*
	2766	* We have a race condition here between the
	2767	* result.shared value returned by the lookup and
	2768	* snapshot creation, which may cause new
	2769	* sharing.
	2770	*
	2771	* To avoid this always quiesce the origin before
	2772	* taking the snap. You want to do this anyway to
	2773	* ensure a consistent application view
	2774	* (i.e. lockfs).
	2775	*
	2776	* More distant ancestors are irrelevant. The
	2777	* shared flag will be set in their case.
	2778	*/
a374bb21	2779	thin_defer_cell(tc, virt_cell);
e8088073	2780	return DM_MAPIO_SUBMITTED;
991d9fa0	2781	}
e8088073	2782
e8088073	2783	build_data_key(tc->td, result.block, &key);
a374bb21 JT	2784	if (bio_detain(tc->pool, &key, bio, &data_cell)) {
a374bb21 JT	2785	cell_defer_no_holder(tc, virt_cell);
e8088073 JT	2786	return DM_MAPIO_SUBMITTED;
	2787	}
	2788
	2789	inc_all_io_entry(tc->pool, bio);
a374bb21 JT	2790	cell_defer_no_holder(tc, data_cell);
a374bb21 JT	2791	cell_defer_no_holder(tc, virt_cell);
e8088073 JT	2792
	2793	remap(tc, bio, result.block);
	2794	return DM_MAPIO_REMAPPED;
991d9fa0 JT	2795
991d9fa0 JT	2796	case -ENODATA:
e49e5829	2797	case -EWOULDBLOCK:
a374bb21	2798	thin_defer_cell(tc, virt_cell);
2aab3850	2799	return DM_MAPIO_SUBMITTED;
e49e5829 JT	2800
	2801	default:
	2802	/*
	2803	* Must always call bio_io_error on failure.
	2804	* dm_thin_find_block can fail with -EINVAL if the
	2805	* pool is switched to fail-io mode.
	2806	*/
	2807	bio_io_error(bio);
a374bb21	2808	cell_defer_no_holder(tc, virt_cell);
2aab3850	2809	return DM_MAPIO_SUBMITTED;
991d9fa0	2810	}
991d9fa0 JT	2811	}
991d9fa0 JT	2812
c140e1c4	2813	static void requeue_bios(struct pool *pool)
991d9fa0	2814	{
c140e1c4 MS	2815	struct thin_c *tc;
	2816
	2817	rcu_read_lock();
	2818	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
8e0c9dac	2819	spin_lock_irq(&tc->lock);
c140e1c4 MS	2820	bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
c140e1c4 MS	2821	bio_list_init(&tc->retry_on_resume_list);
8e0c9dac	2822	spin_unlock_irq(&tc->lock);
c140e1c4 MS	2823	}
c140e1c4 MS	2824	rcu_read_unlock();
991d9fa0 JT	2825	}
991d9fa0 JT	2826
a4a82ce3 HM	2827	/*
a4a82ce3 HM	2828	*--------------------------------------------------------------
991d9fa0	2829	* Binding of control targets to a pool object
a4a82ce3 HM	2830	*--------------------------------------------------------------
a4a82ce3 HM	2831	*/
58051b94 JT	2832	static bool is_factor(sector_t block_size, uint32_t n)
	2833	{
	2834	return !sector_div(block_size, n);
	2835	}
	2836
9bc142dd MS	2837	/*
9bc142dd MS	2838	* If discard_passdown was enabled verify that the data device
0424caa1	2839	* supports discards. Disable discard_passdown if not.
9bc142dd	2840	*/
fa375646	2841	static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
9bc142dd	2842	{
0424caa1 MS	2843	struct pool *pool = pt->pool;
0424caa1 MS	2844	struct block_device *data_bdev = pt->data_dev->bdev;
2f5a65ef	2845	struct queue_limits *data_limits = bdev_limits(data_bdev);
0424caa1	2846	const char *reason = NULL;
9bc142dd	2847
0424caa1	2848	if (!pt->adjusted_pf.discard_passdown)
9bc142dd MS	2849	return;
9bc142dd MS	2850
70200574	2851	if (!bdev_max_discard_sectors(pt->data_dev->bdev))
0424caa1 MS	2852	reason = "discard unsupported";
	2853
	2854	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
	2855	reason = "max discard sectors smaller than a block";
9bc142dd	2856
0424caa1	2857	if (reason) {
385411ff	2858	DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
0424caa1 MS	2859	pt->adjusted_pf.discard_passdown = false;
0424caa1 MS	2860	}
9bc142dd MS	2861	}
9bc142dd MS	2862
991d9fa0 JT	2863	static int bind_control_target(struct pool pool, struct dm_target ti)
	2864	{
	2865	struct pool_c *pt = ti->private;
	2866
e49e5829	2867	/*
9b7aaa64	2868	* We want to make sure that a pool in PM_FAIL mode is never upgraded.
e49e5829	2869	*/
07f2b6e0	2870	enum pool_mode old_mode = get_pool_mode(pool);
0424caa1	2871	enum pool_mode new_mode = pt->adjusted_pf.mode;
e49e5829	2872
8b64e881 MS	2873	/*
	2874	* Don't change the pool's mode until set_pool_mode() below.
	2875	* Otherwise the pool's process_* function pointers may
	2876	* not match the desired pool mode.
	2877	*/
	2878	pt->adjusted_pf.mode = old_mode;
	2879
	2880	pool->ti = ti;
	2881	pool->pf = pt->adjusted_pf;
	2882	pool->low_water_blocks = pt->low_water_blocks;
	2883
9bc142dd	2884	set_pool_mode(pool, new_mode);
f402693d	2885
991d9fa0 JT	2886	return 0;
	2887	}
	2888
	2889	static void unbind_control_target(struct pool pool, struct dm_target ti)
	2890	{
	2891	if (pool->ti == ti)
	2892	pool->ti = NULL;
	2893	}
	2894
a4a82ce3 HM	2895	/*
a4a82ce3 HM	2896	*--------------------------------------------------------------
991d9fa0	2897	* Pool creation
a4a82ce3 HM	2898	*--------------------------------------------------------------
a4a82ce3 HM	2899	*/
67e2e2b2 JT	2900	/* Initialize pool features. */
	2901	static void pool_features_init(struct pool_features *pf)
	2902	{
e49e5829	2903	pf->mode = PM_WRITE;
9bc142dd MS	2904	pf->zero_new_blocks = true;
	2905	pf->discard_enabled = true;
	2906	pf->discard_passdown = true;
787a996c	2907	pf->error_if_no_space = false;
67e2e2b2 JT	2908	}
67e2e2b2 JT	2909
991d9fa0 JT	2910	static void __pool_destroy(struct pool *pool)
	2911	{
	2912	__pool_table_remove(pool);
	2913
a822c83e	2914	vfree(pool->cell_sort_array);
991d9fa0 JT	2915	if (dm_pool_metadata_close(pool->pmd) < 0)
	2916	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
	2917
44feb387	2918	dm_bio_prison_destroy(pool->prison);
991d9fa0 JT	2919	dm_kcopyd_client_destroy(pool->copier);
991d9fa0 JT	2920
88430ebc LM	2921	cancel_delayed_work_sync(&pool->waker);
88430ebc LM	2922	cancel_delayed_work_sync(&pool->no_space_timeout);
991d9fa0 JT	2923	if (pool->wq)
	2924	destroy_workqueue(pool->wq);
	2925
	2926	if (pool->next_mapping)
6f1c819c KO	2927	mempool_free(pool->next_mapping, &pool->mapping_pool);
6f1c819c KO	2928	mempool_exit(&pool->mapping_pool);
44feb387 MS	2929	dm_deferred_set_destroy(pool->shared_read_ds);
44feb387 MS	2930	dm_deferred_set_destroy(pool->all_io_ds);
991d9fa0 JT	2931	kfree(pool);
	2932	}
	2933
a24c2569	2934	static struct kmem_cache *_new_mapping_cache;
a24c2569	2935
991d9fa0 JT	2936	static struct pool pool_create(struct mapped_device pool_md,
991d9fa0 JT	2937	struct block_device *metadata_dev,
873937e7	2938	struct block_device *data_dev,
e49e5829 JT	2939	unsigned long block_size,
e49e5829 JT	2940	int read_only, char **error)
991d9fa0 JT	2941	{
	2942	int r;
	2943	void *err_p;
	2944	struct pool *pool;
	2945	struct dm_pool_metadata *pmd;
e49e5829	2946	bool format_device = read_only ? false : true;
991d9fa0	2947
e49e5829	2948	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
991d9fa0 JT	2949	if (IS_ERR(pmd)) {
991d9fa0 JT	2950	*error = "Error creating metadata object";
00565cff	2951	return ERR_CAST(pmd);
991d9fa0 JT	2952	}
991d9fa0 JT	2953
d3775354	2954	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
991d9fa0 JT	2955	if (!pool) {
	2956	*error = "Error allocating memory for pool";
	2957	err_p = ERR_PTR(-ENOMEM);
	2958	goto bad_pool;
	2959	}
	2960
	2961	pool->pmd = pmd;
	2962	pool->sectors_per_block = block_size;
f9a8e0cd MP	2963	if (block_size & (block_size - 1))
	2964	pool->sectors_per_block_shift = -1;
	2965	else
	2966	pool->sectors_per_block_shift = __ffs(block_size);
991d9fa0	2967	pool->low_water_blocks = 0;
67e2e2b2	2968	pool_features_init(&pool->pf);
a195db2d	2969	pool->prison = dm_bio_prison_create();
991d9fa0 JT	2970	if (!pool->prison) {
	2971	*error = "Error creating pool's bio prison";
	2972	err_p = ERR_PTR(-ENOMEM);
	2973	goto bad_prison;
	2974	}
	2975
df5d2e90	2976	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
991d9fa0 JT	2977	if (IS_ERR(pool->copier)) {
	2978	r = PTR_ERR(pool->copier);
	2979	*error = "Error creating pool's kcopyd client";
	2980	err_p = ERR_PTR(r);
	2981	goto bad_kcopyd_client;
	2982	}
	2983
	2984	/*
	2985	* Create singlethreaded workqueue that will service all devices
	2986	* that use this metadata.
	2987	*/
	2988	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	2989	if (!pool->wq) {
	2990	*error = "Error creating pool's workqueue";
	2991	err_p = ERR_PTR(-ENOMEM);
	2992	goto bad_wq;
	2993	}
	2994
7d327fe0	2995	throttle_init(&pool->throttle);
991d9fa0	2996	INIT_WORK(&pool->worker, do_worker);
905e51b3	2997	INIT_DELAYED_WORK(&pool->waker, do_waker);
85ad643b	2998	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
991d9fa0	2999	spin_lock_init(&pool->lock);
991d9fa0	3000	bio_list_init(&pool->deferred_flush_bios);
4ae280b4	3001	bio_list_init(&pool->deferred_flush_completions);
991d9fa0	3002	INIT_LIST_HEAD(&pool->prepared_mappings);
104655fd	3003	INIT_LIST_HEAD(&pool->prepared_discards);
2a0fbffb	3004	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
c140e1c4	3005	INIT_LIST_HEAD(&pool->active_thins);
88a6621b	3006	pool->low_water_triggered = false;
80e96c54	3007	pool->suspended = true;
c3667cc6	3008	pool->out_of_data_space = false;
44feb387 MS	3009
	3010	pool->shared_read_ds = dm_deferred_set_create();
	3011	if (!pool->shared_read_ds) {
	3012	*error = "Error creating pool's shared read deferred set";
	3013	err_p = ERR_PTR(-ENOMEM);
	3014	goto bad_shared_read_ds;
	3015	}
	3016
	3017	pool->all_io_ds = dm_deferred_set_create();
	3018	if (!pool->all_io_ds) {
	3019	*error = "Error creating pool's all io deferred set";
	3020	err_p = ERR_PTR(-ENOMEM);
	3021	goto bad_all_io_ds;
	3022	}
991d9fa0 JT	3023
991d9fa0 JT	3024	pool->next_mapping = NULL;
6f1c819c KO	3025	r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
	3026	_new_mapping_cache);
	3027	if (r) {
991d9fa0	3028	*error = "Error creating pool's mapping mempool";
6f1c819c	3029	err_p = ERR_PTR(r);
991d9fa0 JT	3030	goto bad_mapping_pool;
	3031	}
	3032
42bc47b3 KC	3033	pool->cell_sort_array =
	3034	vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
	3035	sizeof(*pool->cell_sort_array)));
a822c83e JT	3036	if (!pool->cell_sort_array) {
	3037	*error = "Error allocating cell sort array";
	3038	err_p = ERR_PTR(-ENOMEM);
	3039	goto bad_sort_array;
	3040	}
	3041
991d9fa0	3042	pool->ref_count = 1;
905e51b3	3043	pool->last_commit_jiffies = jiffies;
991d9fa0 JT	3044	pool->pool_md = pool_md;
991d9fa0 JT	3045	pool->md_dev = metadata_dev;
873937e7	3046	pool->data_dev = data_dev;
991d9fa0 JT	3047	__pool_table_insert(pool);
	3048
	3049	return pool;
	3050
a822c83e	3051	bad_sort_array:
6f1c819c	3052	mempool_exit(&pool->mapping_pool);
991d9fa0	3053	bad_mapping_pool:
44feb387 MS	3054	dm_deferred_set_destroy(pool->all_io_ds);
	3055	bad_all_io_ds:
	3056	dm_deferred_set_destroy(pool->shared_read_ds);
	3057	bad_shared_read_ds:
991d9fa0 JT	3058	destroy_workqueue(pool->wq);
	3059	bad_wq:
	3060	dm_kcopyd_client_destroy(pool->copier);
	3061	bad_kcopyd_client:
44feb387	3062	dm_bio_prison_destroy(pool->prison);
991d9fa0 JT	3063	bad_prison:
	3064	kfree(pool);
	3065	bad_pool:
	3066	if (dm_pool_metadata_close(pmd))
	3067	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
	3068
	3069	return err_p;
	3070	}
	3071
	3072	static void __pool_inc(struct pool *pool)
	3073	{
	3074	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	3075	pool->ref_count++;
	3076	}
	3077
	3078	static void __pool_dec(struct pool *pool)
	3079	{
	3080	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	3081	BUG_ON(!pool->ref_count);
	3082	if (!--pool->ref_count)
	3083	__pool_destroy(pool);
	3084	}
	3085
	3086	static struct pool __pool_find(struct mapped_device pool_md,
	3087	struct block_device *metadata_dev,
873937e7	3088	struct block_device *data_dev,
e49e5829 JT	3089	unsigned long block_size, int read_only,
e49e5829 JT	3090	char *error, int created)
991d9fa0 JT	3091	{
	3092	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
	3093
	3094	if (pool) {
f09996c9 MS	3095	if (pool->pool_md != pool_md) {
f09996c9 MS	3096	*error = "metadata device already in use by a pool";
991d9fa0	3097	return ERR_PTR(-EBUSY);
f09996c9	3098	}
873937e7 MP	3099	if (pool->data_dev != data_dev) {
	3100	*error = "data device already in use by a pool";
	3101	return ERR_PTR(-EBUSY);
	3102	}
991d9fa0 JT	3103	__pool_inc(pool);
	3104
	3105	} else {
	3106	pool = __pool_table_lookup(pool_md);
	3107	if (pool) {
873937e7	3108	if (pool->md_dev != metadata_dev \|\| pool->data_dev != data_dev) {
f09996c9	3109	*error = "different pool cannot replace a pool";
991d9fa0	3110	return ERR_PTR(-EINVAL);
f09996c9	3111	}
991d9fa0 JT	3112	__pool_inc(pool);
991d9fa0 JT	3113
67e2e2b2	3114	} else {
873937e7	3115	pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
67e2e2b2 JT	3116	*created = 1;
67e2e2b2 JT	3117	}
991d9fa0 JT	3118	}
	3119
	3120	return pool;
	3121	}
	3122
a4a82ce3 HM	3123	/*
a4a82ce3 HM	3124	*--------------------------------------------------------------
991d9fa0	3125	* Pool target methods
a4a82ce3 HM	3126	*--------------------------------------------------------------
a4a82ce3 HM	3127	*/
991d9fa0 JT	3128	static void pool_dtr(struct dm_target *ti)
	3129	{
	3130	struct pool_c *pt = ti->private;
	3131
	3132	mutex_lock(&dm_thin_pool_table.mutex);
	3133
	3134	unbind_control_target(pt->pool, ti);
	3135	__pool_dec(pt->pool);
	3136	dm_put_device(ti, pt->metadata_dev);
	3137	dm_put_device(ti, pt->data_dev);
	3138	kfree(pt);
	3139
	3140	mutex_unlock(&dm_thin_pool_table.mutex);
	3141	}
	3142
991d9fa0 JT	3143	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
	3144	struct dm_target *ti)
	3145	{
	3146	int r;
86a3238c	3147	unsigned int argc;
991d9fa0 JT	3148	const char *arg_name;
991d9fa0 JT	3149
5916a22b	3150	static const struct dm_arg _args[] = {
74aa45c3	3151	{0, 4, "Invalid number of pool feature arguments"},
991d9fa0 JT	3152	};
	3153
	3154	/*
	3155	* No feature arguments supplied.
	3156	*/
	3157	if (!as->argc)
	3158	return 0;
	3159
	3160	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	3161	if (r)
	3162	return -EINVAL;
	3163
	3164	while (argc && !r) {
	3165	arg_name = dm_shift_arg(as);
	3166	argc--;
	3167
e49e5829	3168	if (!strcasecmp(arg_name, "skip_block_zeroing"))
9bc142dd	3169	pf->zero_new_blocks = false;
e49e5829 JT	3170
e49e5829 JT	3171	else if (!strcasecmp(arg_name, "ignore_discard"))
9bc142dd	3172	pf->discard_enabled = false;
e49e5829 JT	3173
e49e5829 JT	3174	else if (!strcasecmp(arg_name, "no_discard_passdown"))
9bc142dd	3175	pf->discard_passdown = false;
991d9fa0	3176
e49e5829 JT	3177	else if (!strcasecmp(arg_name, "read_only"))
	3178	pf->mode = PM_READ_ONLY;
	3179
787a996c MS	3180	else if (!strcasecmp(arg_name, "error_if_no_space"))
	3181	pf->error_if_no_space = true;
	3182
e49e5829 JT	3183	else {
	3184	ti->error = "Unrecognised pool feature requested";
	3185	r = -EINVAL;
	3186	break;
	3187	}
991d9fa0 JT	3188	}
	3189
	3190	return r;
	3191	}
	3192
ac8c3f3d JT	3193	static void metadata_low_callback(void *context)
	3194	{
	3195	struct pool *pool = context;
	3196
	3197	DMWARN("%s: reached low water mark for metadata device: sending event.",
	3198	dm_device_name(pool->pool_md));
	3199
	3200	dm_table_event(pool->ti->table);
	3201	}
	3202
694cfe7f NT	3203	/*
	3204	* We need to flush the data device before committing the metadata.
	3205	*
	3206	* This ensures that the data blocks of any newly inserted mappings are
	3207	* properly written to non-volatile storage and won't be lost in case of a
	3208	* crash.
	3209	*
	3210	* Failure to do so can result in data corruption in the case of internal or
	3211	* external snapshots and in the case of newly provisioned blocks, when block
	3212	* zeroing is enabled.
	3213	*/
	3214	static int metadata_pre_commit_callback(void *context)
	3215	{
f06c03d1	3216	struct pool *pool = context;
694cfe7f	3217
28d7d128	3218	return blkdev_issue_flush(pool->data_dev);
694cfe7f NT	3219	}
694cfe7f NT	3220
7d48935e MS	3221	static sector_t get_dev_size(struct block_device *bdev)
7d48935e MS	3222	{
6dcbb52c	3223	return bdev_nr_sectors(bdev);
7d48935e MS	3224	}
	3225
	3226	static void warn_if_metadata_device_too_big(struct block_device *bdev)
b17446df	3227	{
7d48935e	3228	sector_t metadata_dev_size = get_dev_size(bdev);
b17446df	3229
7d48935e	3230	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
385411ff CH	3231	DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
385411ff CH	3232	bdev, THIN_METADATA_MAX_SECTORS);
7d48935e MS	3233	}
	3234
	3235	static sector_t get_metadata_dev_size(struct block_device *bdev)
	3236	{
	3237	sector_t metadata_dev_size = get_dev_size(bdev);
	3238
	3239	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
	3240	metadata_dev_size = THIN_METADATA_MAX_SECTORS;
b17446df JT	3241
	3242	return metadata_dev_size;
	3243	}
	3244
24347e95 JT	3245	static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
	3246	{
	3247	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
	3248
7d48935e	3249	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
24347e95 JT	3250
	3251	return metadata_dev_size;
	3252	}
	3253
ac8c3f3d JT	3254	/*
	3255	* When a metadata threshold is crossed a dm event is triggered, and
	3256	* userland should respond by growing the metadata device. We could let
	3257	* userland set the threshold, like we do with the data threshold, but I'm
	3258	* not sure they know enough to do this well.
	3259	*/
	3260	static dm_block_t calc_metadata_threshold(struct pool_c *pt)
	3261	{
	3262	/*
	3263	* 4M is ample for all ops with the possible exception of thin
	3264	* device deletion which is harmless if it fails (just retry the
	3265	* delete after you've grown the device).
	3266	*/
	3267	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
0ef0b471	3268
ac8c3f3d JT	3269	return min((dm_block_t)1024ULL /* 4M */, quarter);
	3270	}
	3271
991d9fa0 JT	3272	/*
	3273	* thin-pool <metadata dev> <data dev>
	3274	* <data block size (sectors)>
	3275	* <low water mark (blocks)>
	3276	* [<#feature args> [<arg>]*]
	3277	*
	3278	* Optional feature arguments are:
	3279	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
67e2e2b2 JT	3280	* ignore_discard: disable discard
67e2e2b2 JT	3281	* no_discard_passdown: don't pass discards down to the data device
787a996c MS	3282	* read_only: Don't allow any changes to be made to the pool metadata.
787a996c MS	3283	* error_if_no_space: error IOs, instead of queueing, if no space.
991d9fa0	3284	*/
86a3238c	3285	static int pool_ctr(struct dm_target ti, unsigned int argc, char *argv)
991d9fa0	3286	{
67e2e2b2	3287	int r, pool_created = 0;
991d9fa0 JT	3288	struct pool_c *pt;
	3289	struct pool *pool;
	3290	struct pool_features pf;
	3291	struct dm_arg_set as;
	3292	struct dm_dev *data_dev;
	3293	unsigned long block_size;
	3294	dm_block_t low_water_blocks;
	3295	struct dm_dev *metadata_dev;
05bdb996	3296	blk_mode_t metadata_mode;
991d9fa0 JT	3297
	3298	/*
	3299	* FIXME Remove validation from scope of lock.
	3300	*/
	3301	mutex_lock(&dm_thin_pool_table.mutex);
	3302
	3303	if (argc < 4) {
	3304	ti->error = "Invalid argument count";
	3305	r = -EINVAL;
	3306	goto out_unlock;
	3307	}
5d0db96d	3308
991d9fa0 JT	3309	as.argc = argc;
	3310	as.argv = argv;
	3311
70de2cbd JCXF	3312	/* make sure metadata and data are different devices */
	3313	if (!strcmp(argv[0], argv[1])) {
	3314	ti->error = "Error setting metadata or data device";
	3315	r = -EINVAL;
	3316	goto out_unlock;
	3317	}
	3318
5d0db96d JT	3319	/*
	3320	* Set default pool features.
	3321	*/
	3322	pool_features_init(&pf);
	3323
	3324	dm_consume_args(&as, 4);
	3325	r = parse_pool_features(&as, &pf, ti);
	3326	if (r)
	3327	goto out_unlock;
	3328
05bdb996 CH	3329	metadata_mode = BLK_OPEN_READ \|
05bdb996 CH	3330	((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE);
5d0db96d	3331	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
991d9fa0 JT	3332	if (r) {
	3333	ti->error = "Error opening metadata block device";
	3334	goto out_unlock;
	3335	}
7d48935e	3336	warn_if_metadata_device_too_big(metadata_dev->bdev);
991d9fa0	3337
05bdb996	3338	r = dm_get_device(ti, argv[1], BLK_OPEN_READ \| BLK_OPEN_WRITE, &data_dev);
991d9fa0 JT	3339	if (r) {
	3340	ti->error = "Error getting data device";
	3341	goto out_metadata;
	3342	}
	3343
	3344	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
	3345	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
	3346	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
55f2b8bd	3347	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
991d9fa0 JT	3348	ti->error = "Invalid block size";
	3349	r = -EINVAL;
	3350	goto out;
	3351	}
	3352
	3353	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
	3354	ti->error = "Invalid low water mark";
	3355	r = -EINVAL;
	3356	goto out;
	3357	}
	3358
991d9fa0 JT	3359	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	3360	if (!pt) {
	3361	r = -ENOMEM;
	3362	goto out;
	3363	}
	3364
873937e7	3365	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
e49e5829	3366	block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
991d9fa0 JT	3367	if (IS_ERR(pool)) {
	3368	r = PTR_ERR(pool);
	3369	goto out_free_pt;
	3370	}
	3371
67e2e2b2 JT	3372	/*
	3373	* 'pool_created' reflects whether this is the first table load.
	3374	* Top level discard support is not allowed to be changed after
	3375	* initial load. This would require a pool reload to trigger thin
	3376	* device changes.
	3377	*/
	3378	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
	3379	ti->error = "Discard support cannot be disabled once enabled";
	3380	r = -EINVAL;
	3381	goto out_flags_changed;
	3382	}
	3383
991d9fa0 JT	3384	pt->pool = pool;
	3385	pt->ti = ti;
	3386	pt->metadata_dev = metadata_dev;
	3387	pt->data_dev = data_dev;
	3388	pt->low_water_blocks = low_water_blocks;
0424caa1	3389	pt->adjusted_pf = pt->requested_pf = pf;
55a62eef	3390	ti->num_flush_bios = 1;
9bbf5fee	3391	ti->limit_swap_bios = true;
9bc142dd	3392
67e2e2b2 JT	3393	/*
	3394	* Only need to enable discards if the pool should pass
	3395	* them down to the data device. The thin device's discard
	3396	* processing will cause mappings to be removed from the btree.
	3397	*/
	3398	if (pf.discard_enabled && pf.discard_passdown) {
55a62eef	3399	ti->num_discard_bios = 1;
67e2e2b2 JT	3400	/*
	3401	* Setting 'discards_supported' circumvents the normal
	3402	* stacking of discard limits (this keeps the pool and
	3403	* thin devices' discard limits consistent).
	3404	*/
0ac55489	3405	ti->discards_supported = true;
e2dd8aca	3406	ti->max_discard_granularity = true;
67e2e2b2	3407	}
991d9fa0 JT	3408	ti->private = pt;
991d9fa0 JT	3409
ac8c3f3d JT	3410	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
	3411	calc_metadata_threshold(pt),
	3412	metadata_low_callback,
	3413	pool);
3534e5a5 LM	3414	if (r) {
3534e5a5 LM	3415	ti->error = "Error registering metadata threshold";
ba30670f	3416	goto out_flags_changed;
3534e5a5	3417	}
ac8c3f3d	3418
f06c03d1 MP	3419	dm_pool_register_pre_commit_callback(pool->pmd,
	3420	metadata_pre_commit_callback, pool);
	3421
991d9fa0 JT	3422	mutex_unlock(&dm_thin_pool_table.mutex);
	3423
	3424	return 0;
	3425
67e2e2b2 JT	3426	out_flags_changed:
67e2e2b2 JT	3427	__pool_dec(pool);
991d9fa0 JT	3428	out_free_pt:
	3429	kfree(pt);
	3430	out:
	3431	dm_put_device(ti, data_dev);
	3432	out_metadata:
	3433	dm_put_device(ti, metadata_dev);
	3434	out_unlock:
	3435	mutex_unlock(&dm_thin_pool_table.mutex);
	3436
	3437	return r;
	3438	}
	3439
7de3ee57	3440	static int pool_map(struct dm_target ti, struct bio bio)
991d9fa0	3441	{
991d9fa0 JT	3442	struct pool_c *pt = ti->private;
991d9fa0 JT	3443	struct pool *pool = pt->pool;
991d9fa0 JT	3444
	3445	/*
	3446	* As this is a singleton target, ti->begin is always zero.
	3447	*/
8e0c9dac	3448	spin_lock_irq(&pool->lock);
74d46992	3449	bio_set_dev(bio, pt->data_dev->bdev);
8e0c9dac	3450	spin_unlock_irq(&pool->lock);
991d9fa0	3451
c0a7a0ac	3452	return DM_MAPIO_REMAPPED;
991d9fa0 JT	3453	}
991d9fa0 JT	3454
b17446df	3455	static int maybe_resize_data_dev(struct dm_target ti, bool need_commit)
991d9fa0 JT	3456	{
	3457	int r;
	3458	struct pool_c *pt = ti->private;
	3459	struct pool *pool = pt->pool;
55f2b8bd MS	3460	sector_t data_size = ti->len;
55f2b8bd MS	3461	dm_block_t sb_data_size;
991d9fa0	3462
b17446df	3463	*need_commit = false;
991d9fa0	3464
55f2b8bd MS	3465	(void) sector_div(data_size, pool->sectors_per_block);
55f2b8bd MS	3466
991d9fa0 JT	3467	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
991d9fa0 JT	3468	if (r) {
4fa5971a MS	3469	DMERR("%s: failed to retrieve data device size",
4fa5971a MS	3470	dm_device_name(pool->pool_md));
991d9fa0 JT	3471	return r;
	3472	}
	3473
	3474	if (data_size < sb_data_size) {
4fa5971a MS	3475	DMERR("%s: pool target (%llu blocks) too small: expected %llu",
4fa5971a MS	3476	dm_device_name(pool->pool_md),
55f2b8bd	3477	(unsigned long long)data_size, sb_data_size);
991d9fa0 JT	3478	return -EINVAL;
	3479
	3480	} else if (data_size > sb_data_size) {
07f2b6e0 MS	3481	if (dm_pool_metadata_needs_check(pool->pmd)) {
	3482	DMERR("%s: unable to grow the data device until repaired.",
	3483	dm_device_name(pool->pool_md));
	3484	return 0;
	3485	}
	3486
6f7f51d4 MS	3487	if (sb_data_size)
	3488	DMINFO("%s: growing the data device from %llu to %llu blocks",
	3489	dm_device_name(pool->pool_md),
	3490	sb_data_size, (unsigned long long)data_size);
991d9fa0 JT	3491	r = dm_pool_resize_data_dev(pool->pmd, data_size);
991d9fa0 JT	3492	if (r) {
b5330655	3493	metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
991d9fa0 JT	3494	return r;
	3495	}
	3496
b17446df	3497	*need_commit = true;
991d9fa0 JT	3498	}
	3499
	3500	return 0;
	3501	}
	3502
24347e95 JT	3503	static int maybe_resize_metadata_dev(struct dm_target ti, bool need_commit)
	3504	{
	3505	int r;
	3506	struct pool_c *pt = ti->private;
	3507	struct pool *pool = pt->pool;
	3508	dm_block_t metadata_dev_size, sb_metadata_dev_size;
	3509
	3510	*need_commit = false;
	3511
610bba8b	3512	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
24347e95 JT	3513
	3514	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
	3515	if (r) {
4fa5971a MS	3516	DMERR("%s: failed to retrieve metadata device size",
4fa5971a MS	3517	dm_device_name(pool->pool_md));
24347e95 JT	3518	return r;
	3519	}
	3520
	3521	if (metadata_dev_size < sb_metadata_dev_size) {
4fa5971a MS	3522	DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
4fa5971a MS	3523	dm_device_name(pool->pool_md),
24347e95 JT	3524	metadata_dev_size, sb_metadata_dev_size);
	3525	return -EINVAL;
	3526
	3527	} else if (metadata_dev_size > sb_metadata_dev_size) {
07f2b6e0 MS	3528	if (dm_pool_metadata_needs_check(pool->pmd)) {
	3529	DMERR("%s: unable to grow the metadata device until repaired.",
	3530	dm_device_name(pool->pool_md));
	3531	return 0;
	3532	}
	3533
7d48935e	3534	warn_if_metadata_device_too_big(pool->md_dev);
6f7f51d4 MS	3535	DMINFO("%s: growing the metadata device from %llu to %llu blocks",
	3536	dm_device_name(pool->pool_md),
	3537	sb_metadata_dev_size, metadata_dev_size);
3ab91828 JT	3538
	3539	if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
	3540	set_pool_mode(pool, PM_WRITE);
	3541
24347e95 JT	3542	r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
24347e95 JT	3543	if (r) {
b5330655	3544	metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
24347e95 JT	3545	return r;
	3546	}
	3547
	3548	*need_commit = true;
	3549	}
	3550
	3551	return 0;
	3552	}
	3553
b17446df JT	3554	/*
	3555	* Retrieves the number of blocks of the data device from
	3556	* the superblock and compares it to the actual device size,
	3557	* thus resizing the data device in case it has grown.
	3558	*
	3559	* This both copes with opening preallocated data devices in the ctr
	3560	* being followed by a resume
	3561	* -and-
	3562	* calling the resume method individually after userspace has
	3563	* grown the data device in reaction to a table event.
	3564	*/
	3565	static int pool_preresume(struct dm_target *ti)
	3566	{
	3567	int r;
24347e95	3568	bool need_commit1, need_commit2;
b17446df JT	3569	struct pool_c *pt = ti->private;
	3570	struct pool *pool = pt->pool;
	3571
	3572	/*
	3573	* Take control of the pool object.
	3574	*/
	3575	r = bind_control_target(pool, ti);
	3576	if (r)
19eb1650	3577	goto out;
b17446df JT	3578
	3579	r = maybe_resize_data_dev(ti, &need_commit1);
	3580	if (r)
19eb1650	3581	goto out;
b17446df	3582
24347e95 JT	3583	r = maybe_resize_metadata_dev(ti, &need_commit2);
24347e95 JT	3584	if (r)
19eb1650	3585	goto out;
24347e95 JT	3586
24347e95 JT	3587	if (need_commit1 \|\| need_commit2)
020cc3b5	3588	(void) commit(pool);
19eb1650 LM	3589	out:
	3590	/*
	3591	* When a thin-pool is PM_FAIL, it cannot be rebuilt if
	3592	* bio is in deferred list. Therefore need to return 0
	3593	* to allow pool_resume() to flush IO.
	3594	*/
	3595	if (r && get_pool_mode(pool) == PM_FAIL)
	3596	r = 0;
b17446df	3597
19eb1650	3598	return r;
b17446df JT	3599	}
b17446df JT	3600
583024d2 MS	3601	static void pool_suspend_active_thins(struct pool *pool)
	3602	{
	3603	struct thin_c *tc;
	3604
	3605	/* Suspend all active thin devices */
	3606	tc = get_first_thin(pool);
	3607	while (tc) {
	3608	dm_internal_suspend_noflush(tc->thin_md);
	3609	tc = get_next_thin(pool, tc);
	3610	}
	3611	}
	3612
	3613	static void pool_resume_active_thins(struct pool *pool)
	3614	{
	3615	struct thin_c *tc;
	3616
	3617	/* Resume all active thin devices */
	3618	tc = get_first_thin(pool);
	3619	while (tc) {
	3620	dm_internal_resume(tc->thin_md);
	3621	tc = get_next_thin(pool, tc);
	3622	}
	3623	}
	3624
991d9fa0 JT	3625	static void pool_resume(struct dm_target *ti)
	3626	{
	3627	struct pool_c *pt = ti->private;
	3628	struct pool *pool = pt->pool;
991d9fa0	3629
583024d2 MS	3630	/*
	3631	* Must requeue active_thins' bios and then resume
	3632	* active_thins _before_ clearing 'suspend' flag.
	3633	*/
	3634	requeue_bios(pool);
	3635	pool_resume_active_thins(pool);
	3636
8e0c9dac	3637	spin_lock_irq(&pool->lock);
88a6621b	3638	pool->low_water_triggered = false;
80e96c54	3639	pool->suspended = false;
8e0c9dac	3640	spin_unlock_irq(&pool->lock);
80e96c54	3641
905e51b3	3642	do_waker(&pool->waker.work);
991d9fa0 JT	3643	}
991d9fa0 JT	3644
80e96c54 MS	3645	static void pool_presuspend(struct dm_target *ti)
	3646	{
	3647	struct pool_c *pt = ti->private;
	3648	struct pool *pool = pt->pool;
80e96c54	3649
8e0c9dac	3650	spin_lock_irq(&pool->lock);
80e96c54	3651	pool->suspended = true;
8e0c9dac	3652	spin_unlock_irq(&pool->lock);
583024d2 MS	3653
583024d2 MS	3654	pool_suspend_active_thins(pool);
80e96c54 MS	3655	}
	3656
	3657	static void pool_presuspend_undo(struct dm_target *ti)
	3658	{
	3659	struct pool_c *pt = ti->private;
	3660	struct pool *pool = pt->pool;
80e96c54	3661
583024d2 MS	3662	pool_resume_active_thins(pool);
583024d2 MS	3663
8e0c9dac	3664	spin_lock_irq(&pool->lock);
80e96c54	3665	pool->suspended = false;
8e0c9dac	3666	spin_unlock_irq(&pool->lock);
80e96c54 MS	3667	}
80e96c54 MS	3668
991d9fa0 JT	3669	static void pool_postsuspend(struct dm_target *ti)
991d9fa0 JT	3670	{
991d9fa0 JT	3671	struct pool_c *pt = ti->private;
	3672	struct pool *pool = pt->pool;
	3673
18d03e8c NB	3674	cancel_delayed_work_sync(&pool->waker);
18d03e8c NB	3675	cancel_delayed_work_sync(&pool->no_space_timeout);
991d9fa0	3676	flush_workqueue(pool->wq);
020cc3b5	3677	(void) commit(pool);
991d9fa0 JT	3678	}
991d9fa0 JT	3679
86a3238c	3680	static int check_arg_count(unsigned int argc, unsigned int args_required)
991d9fa0 JT	3681	{
	3682	if (argc != args_required) {
	3683	DMWARN("Message received with %u arguments instead of %u.",
	3684	argc, args_required);
	3685	return -EINVAL;
	3686	}
	3687
	3688	return 0;
	3689	}
	3690
	3691	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
	3692	{
	3693	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	3694	*dev_id <= MAX_DEV_ID)
	3695	return 0;
	3696
	3697	if (warning)
	3698	DMWARN("Message received with invalid device id: %s", arg);
	3699
	3700	return -EINVAL;
	3701	}
	3702
86a3238c	3703	static int process_create_thin_mesg(unsigned int argc, char *argv, struct pool pool)
991d9fa0 JT	3704	{
	3705	dm_thin_id dev_id;
	3706	int r;
	3707
	3708	r = check_arg_count(argc, 2);
	3709	if (r)
	3710	return r;
	3711
	3712	r = read_dev_id(argv[1], &dev_id, 1);
	3713	if (r)
	3714	return r;
	3715
	3716	r = dm_pool_create_thin(pool->pmd, dev_id);
	3717	if (r) {
	3718	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
	3719	argv[1]);
	3720	return r;
	3721	}
	3722
	3723	return 0;
	3724	}
	3725
86a3238c	3726	static int process_create_snap_mesg(unsigned int argc, char *argv, struct pool pool)
991d9fa0 JT	3727	{
	3728	dm_thin_id dev_id;
	3729	dm_thin_id origin_dev_id;
	3730	int r;
	3731
	3732	r = check_arg_count(argc, 3);
	3733	if (r)
	3734	return r;
	3735
	3736	r = read_dev_id(argv[1], &dev_id, 1);
	3737	if (r)
	3738	return r;
	3739
	3740	r = read_dev_id(argv[2], &origin_dev_id, 1);
	3741	if (r)
	3742	return r;
	3743
	3744	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	3745	if (r) {
	3746	DMWARN("Creation of new snapshot %s of device %s failed.",
	3747	argv[1], argv[2]);
	3748	return r;
	3749	}
	3750
	3751	return 0;
	3752	}
	3753
86a3238c	3754	static int process_delete_mesg(unsigned int argc, char *argv, struct pool pool)
991d9fa0 JT	3755	{
	3756	dm_thin_id dev_id;
	3757	int r;
	3758
	3759	r = check_arg_count(argc, 2);
	3760	if (r)
	3761	return r;
	3762
	3763	r = read_dev_id(argv[1], &dev_id, 1);
	3764	if (r)
	3765	return r;
	3766
	3767	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	3768	if (r)
	3769	DMWARN("Deletion of thin device %s failed.", argv[1]);
	3770
	3771	return r;
	3772	}
	3773
86a3238c	3774	static int process_set_transaction_id_mesg(unsigned int argc, char *argv, struct pool pool)
991d9fa0 JT	3775	{
	3776	dm_thin_id old_id, new_id;
	3777	int r;
	3778
	3779	r = check_arg_count(argc, 3);
	3780	if (r)
	3781	return r;
	3782
	3783	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
	3784	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
	3785	return -EINVAL;
	3786	}
	3787
	3788	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
	3789	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
	3790	return -EINVAL;
	3791	}
	3792
	3793	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	3794	if (r) {
	3795	DMWARN("Failed to change transaction id from %s to %s.",
	3796	argv[1], argv[2]);
	3797	return r;
	3798	}
	3799
	3800	return 0;
	3801	}
	3802
86a3238c	3803	static int process_reserve_metadata_snap_mesg(unsigned int argc, char *argv, struct pool pool)
cc8394d8 JT	3804	{
	3805	int r;
	3806
	3807	r = check_arg_count(argc, 1);
	3808	if (r)
	3809	return r;
	3810
020cc3b5	3811	(void) commit(pool);
0d200aef	3812
cc8394d8 JT	3813	r = dm_pool_reserve_metadata_snap(pool->pmd);
	3814	if (r)
	3815	DMWARN("reserve_metadata_snap message failed.");
	3816
	3817	return r;
	3818	}
	3819
86a3238c	3820	static int process_release_metadata_snap_mesg(unsigned int argc, char *argv, struct pool pool)
cc8394d8 JT	3821	{
	3822	int r;
	3823
	3824	r = check_arg_count(argc, 1);
	3825	if (r)
	3826	return r;
	3827
	3828	r = dm_pool_release_metadata_snap(pool->pmd);
	3829	if (r)
	3830	DMWARN("release_metadata_snap message failed.");
	3831
	3832	return r;
	3833	}
	3834
991d9fa0 JT	3835	/*
	3836	* Messages supported:
	3837	* create_thin <dev_id>
	3838	* create_snap <dev_id> <origin_id>
	3839	* delete <dev_id>
991d9fa0	3840	* set_transaction_id <current_trans_id> <new_trans_id>
cc8394d8 JT	3841	* reserve_metadata_snap
cc8394d8 JT	3842	* release_metadata_snap
991d9fa0	3843	*/
86a3238c HM	3844	static int pool_message(struct dm_target ti, unsigned int argc, char *argv,
86a3238c HM	3845	char *result, unsigned int maxlen)
991d9fa0 JT	3846	{
	3847	int r = -EINVAL;
	3848	struct pool_c *pt = ti->private;
	3849	struct pool *pool = pt->pool;
	3850
3ab91828	3851	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
2a7eaea0 JT	3852	DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
2a7eaea0 JT	3853	dm_device_name(pool->pool_md));
fd467696	3854	return -EOPNOTSUPP;
2a7eaea0 JT	3855	}
2a7eaea0 JT	3856
991d9fa0 JT	3857	if (!strcasecmp(argv[0], "create_thin"))
	3858	r = process_create_thin_mesg(argc, argv, pool);
	3859
	3860	else if (!strcasecmp(argv[0], "create_snap"))
	3861	r = process_create_snap_mesg(argc, argv, pool);
	3862
	3863	else if (!strcasecmp(argv[0], "delete"))
	3864	r = process_delete_mesg(argc, argv, pool);
	3865
	3866	else if (!strcasecmp(argv[0], "set_transaction_id"))
	3867	r = process_set_transaction_id_mesg(argc, argv, pool);
	3868
cc8394d8 JT	3869	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
	3870	r = process_reserve_metadata_snap_mesg(argc, argv, pool);
	3871
	3872	else if (!strcasecmp(argv[0], "release_metadata_snap"))
	3873	r = process_release_metadata_snap_mesg(argc, argv, pool);
	3874
991d9fa0 JT	3875	else
	3876	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
	3877
e49e5829	3878	if (!r)
020cc3b5	3879	(void) commit(pool);
991d9fa0 JT	3880
	3881	return r;
	3882	}
	3883
e49e5829	3884	static void emit_flags(struct pool_features pf, char result,
86a3238c	3885	unsigned int sz, unsigned int maxlen)
e49e5829	3886	{
86a3238c	3887	unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled +
787a996c MS	3888	!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
787a996c MS	3889	pf->error_if_no_space;
e49e5829 JT	3890	DMEMIT("%u ", count);
	3891
	3892	if (!pf->zero_new_blocks)
	3893	DMEMIT("skip_block_zeroing ");
	3894
	3895	if (!pf->discard_enabled)
	3896	DMEMIT("ignore_discard ");
	3897
	3898	if (!pf->discard_passdown)
	3899	DMEMIT("no_discard_passdown ");
	3900
	3901	if (pf->mode == PM_READ_ONLY)
	3902	DMEMIT("read_only ");
787a996c MS	3903
	3904	if (pf->error_if_no_space)
	3905	DMEMIT("error_if_no_space ");
e49e5829 JT	3906	}
e49e5829 JT	3907
991d9fa0 JT	3908	/*
	3909	* Status line is:
	3910	* <transaction id> <used metadata sectors>/<total metadata sectors>
	3911	* <used data sectors>/<total data sectors> <held metadata root>
e4c78e21	3912	* <pool mode> <discard config> <no space config> <needs_check>
991d9fa0	3913	*/
fd7c092e	3914	static void pool_status(struct dm_target *ti, status_type_t type,
86a3238c	3915	unsigned int status_flags, char *result, unsigned int maxlen)
991d9fa0	3916	{
e49e5829	3917	int r;
86a3238c	3918	unsigned int sz = 0;
991d9fa0 JT	3919	uint64_t transaction_id;
	3920	dm_block_t nr_free_blocks_data;
	3921	dm_block_t nr_free_blocks_metadata;
	3922	dm_block_t nr_blocks_data;
	3923	dm_block_t nr_blocks_metadata;
	3924	dm_block_t held_root;
3ab91828	3925	enum pool_mode mode;
991d9fa0 JT	3926	char buf[BDEVNAME_SIZE];
	3927	char buf2[BDEVNAME_SIZE];
	3928	struct pool_c *pt = ti->private;
	3929	struct pool *pool = pt->pool;
	3930
	3931	switch (type) {
	3932	case STATUSTYPE_INFO:
e49e5829 JT	3933	if (get_pool_mode(pool) == PM_FAIL) {
	3934	DMEMIT("Fail");
	3935	break;
	3936	}
	3937
1f4e0ff0 AK	3938	/* Commit to ensure statistics aren't out-of-date */
1f4e0ff0 AK	3939	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
020cc3b5	3940	(void) commit(pool);
1f4e0ff0	3941
fd7c092e MP	3942	r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
fd7c092e MP	3943	if (r) {
4fa5971a MS	3944	DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
4fa5971a MS	3945	dm_device_name(pool->pool_md), r);
fd7c092e MP	3946	goto err;
fd7c092e MP	3947	}
991d9fa0	3948
fd7c092e MP	3949	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
fd7c092e MP	3950	if (r) {
4fa5971a MS	3951	DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
4fa5971a MS	3952	dm_device_name(pool->pool_md), r);
fd7c092e MP	3953	goto err;
fd7c092e MP	3954	}
991d9fa0 JT	3955
991d9fa0 JT	3956	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
fd7c092e	3957	if (r) {
4fa5971a MS	3958	DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
4fa5971a MS	3959	dm_device_name(pool->pool_md), r);
fd7c092e MP	3960	goto err;
fd7c092e MP	3961	}
991d9fa0	3962
fd7c092e MP	3963	r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
fd7c092e MP	3964	if (r) {
4fa5971a MS	3965	DMERR("%s: dm_pool_get_free_block_count returned %d",
4fa5971a MS	3966	dm_device_name(pool->pool_md), r);
fd7c092e MP	3967	goto err;
fd7c092e MP	3968	}
991d9fa0 JT	3969
991d9fa0 JT	3970	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
fd7c092e	3971	if (r) {
4fa5971a MS	3972	DMERR("%s: dm_pool_get_data_dev_size returned %d",
4fa5971a MS	3973	dm_device_name(pool->pool_md), r);
fd7c092e MP	3974	goto err;
fd7c092e MP	3975	}
991d9fa0	3976
cc8394d8	3977	r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
fd7c092e	3978	if (r) {
4fa5971a MS	3979	DMERR("%s: dm_pool_get_metadata_snap returned %d",
4fa5971a MS	3980	dm_device_name(pool->pool_md), r);
fd7c092e MP	3981	goto err;
fd7c092e MP	3982	}
991d9fa0 JT	3983
	3984	DMEMIT("%llu %llu/%llu %llu/%llu ",
	3985	(unsigned long long)transaction_id,
	3986	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
	3987	(unsigned long long)nr_blocks_metadata,
	3988	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
	3989	(unsigned long long)nr_blocks_data);
	3990
	3991	if (held_root)
e49e5829 JT	3992	DMEMIT("%llu ", held_root);
	3993	else
	3994	DMEMIT("- ");
	3995
3ab91828 JT	3996	mode = get_pool_mode(pool);
3ab91828 JT	3997	if (mode == PM_OUT_OF_DATA_SPACE)
3e1a0699	3998	DMEMIT("out_of_data_space ");
3ab91828	3999	else if (is_read_only_pool_mode(mode))
e49e5829	4000	DMEMIT("ro ");
991d9fa0	4001	else
e49e5829 JT	4002	DMEMIT("rw ");
e49e5829 JT	4003
018debea	4004	if (!pool->pf.discard_enabled)
787a996c	4005	DMEMIT("ignore_discard ");
018debea	4006	else if (pool->pf.discard_passdown)
787a996c MS	4007	DMEMIT("discard_passdown ");
	4008	else
	4009	DMEMIT("no_discard_passdown ");
	4010
	4011	if (pool->pf.error_if_no_space)
	4012	DMEMIT("error_if_no_space ");
e49e5829	4013	else
787a996c	4014	DMEMIT("queue_if_no_space ");
991d9fa0	4015
e4c78e21 MS	4016	if (dm_pool_metadata_needs_check(pool->pmd))
	4017	DMEMIT("needs_check ");
	4018	else
	4019	DMEMIT("- ");
	4020
63c8ecb6 AG	4021	DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
63c8ecb6 AG	4022
991d9fa0 JT	4023	break;
	4024
	4025	case STATUSTYPE_TABLE:
	4026	DMEMIT("%s %s %lu %llu ",
	4027	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
	4028	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
	4029	(unsigned long)pool->sectors_per_block,
	4030	(unsigned long long)pt->low_water_blocks);
0424caa1	4031	emit_flags(&pt->requested_pf, result, sz, maxlen);
991d9fa0	4032	break;
8ec45662 TS	4033
	4034	case STATUSTYPE_IMA:
	4035	*result = '\0';
	4036	break;
991d9fa0	4037	}
fd7c092e	4038	return;
991d9fa0	4039
fd7c092e MP	4040	err:
fd7c092e MP	4041	DMEMIT("Error");
991d9fa0 JT	4042	}
	4043
	4044	static int pool_iterate_devices(struct dm_target *ti,
	4045	iterate_devices_callout_fn fn, void *data)
	4046	{
	4047	struct pool_c *pt = ti->private;
	4048
	4049	return fn(ti, pt->data_dev, 0, ti->len, data);
	4050	}
	4051
991d9fa0 JT	4052	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
	4053	{
	4054	struct pool_c *pt = ti->private;
	4055	struct pool *pool = pt->pool;
604ea906 MS	4056	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
	4057
	4058	/*
d200c30e MS	4059	* If max_sectors is smaller than pool->sectors_per_block adjust it
	4060	* to the highest possible power-of-2 factor of pool->sectors_per_block.
	4061	* This is especially beneficial when the pool's data device is a RAID
	4062	* device that has a full stripe width that matches pool->sectors_per_block
	4063	* -- because even though partial RAID stripe-sized IOs will be issued to a
	4064	* single RAID stripe; when aggregated they will end on a full RAID stripe
	4065	* boundary.. which avoids additional partial RAID stripe writes cascading
604ea906	4066	*/
604ea906 MS	4067	if (limits->max_sectors < pool->sectors_per_block) {
	4068	while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
	4069	if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
	4070	limits->max_sectors--;
	4071	limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
	4072	}
604ea906	4073	}
991d9fa0	4074
0cc67cd9 MS	4075	/*
	4076	* If the system-determined stacked limits are compatible with the
	4077	* pool's blocksize (io_opt is a factor) do not override them.
	4078	*/
	4079	if (io_opt_sectors < pool->sectors_per_block \|\|
604ea906 MS	4080	!is_factor(io_opt_sectors, pool->sectors_per_block)) {
604ea906 MS	4081	if (is_factor(pool->sectors_per_block, limits->max_sectors))
0a94a469	4082	limits->io_min = limits->max_sectors << SECTOR_SHIFT;
604ea906	4083	else
0a94a469 CH	4084	limits->io_min = pool->sectors_per_block << SECTOR_SHIFT;
0a94a469 CH	4085	limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT;
0cc67cd9	4086	}
0424caa1 MS	4087
	4088	/*
	4089	* pt->adjusted_pf is a staging area for the actual features to use.
	4090	* They get transferred to the live pool in bind_control_target()
	4091	* called from pool_preresume().
	4092	*/
ef6953fb MS	4093
ef6953fb MS	4094	if (pt->adjusted_pf.discard_enabled) {
fa375646 MS	4095	disable_discard_passdown_if_not_supported(pt);
fa375646 MS	4096	if (!pt->adjusted_pf.discard_passdown)
825d8bbd	4097	limits->max_hw_discard_sectors = 0;
ef6953fb MS	4098	/*
	4099	* The pool uses the same discard limits as the underlying data
	4100	* device. DM core has already set this up.
	4101	*/
	4102	} else {
b60ab990 MS	4103	/*
	4104	* Must explicitly disallow stacking discard limits otherwise the
	4105	* block layer will stack them if pool's data device has support.
b60ab990 MS	4106	*/
b60ab990 MS	4107	limits->discard_granularity = 0;
b60ab990	4108	}
991d9fa0 JT	4109	}
	4110
	4111	static struct target_type pool_target = {
	4112	.name = "thin-pool",
	4113	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
	4114	DM_TARGET_IMMUTABLE,
e2dd8aca	4115	.version = {1, 23, 0},
991d9fa0 JT	4116	.module = THIS_MODULE,
	4117	.ctr = pool_ctr,
	4118	.dtr = pool_dtr,
	4119	.map = pool_map,
80e96c54 MS	4120	.presuspend = pool_presuspend,
80e96c54 MS	4121	.presuspend_undo = pool_presuspend_undo,
991d9fa0 JT	4122	.postsuspend = pool_postsuspend,
	4123	.preresume = pool_preresume,
	4124	.resume = pool_resume,
	4125	.message = pool_message,
	4126	.status = pool_status,
991d9fa0 JT	4127	.iterate_devices = pool_iterate_devices,
	4128	.io_hints = pool_io_hints,
	4129	};
	4130
a4a82ce3 HM	4131	/*
a4a82ce3 HM	4132	*--------------------------------------------------------------
991d9fa0	4133	* Thin target methods
a4a82ce3 HM	4134	*--------------------------------------------------------------
a4a82ce3 HM	4135	*/
b10ebd34 JT	4136	static void thin_get(struct thin_c *tc)
b10ebd34 JT	4137	{
22d4c291	4138	refcount_inc(&tc->refcount);
b10ebd34 JT	4139	}
	4140
	4141	static void thin_put(struct thin_c *tc)
	4142	{
22d4c291	4143	if (refcount_dec_and_test(&tc->refcount))
b10ebd34 JT	4144	complete(&tc->can_destroy);
	4145	}
	4146
991d9fa0 JT	4147	static void thin_dtr(struct dm_target *ti)
	4148	{
	4149	struct thin_c *tc = ti->private;
c140e1c4	4150
8e0c9dac	4151	spin_lock_irq(&tc->pool->lock);
c140e1c4	4152	list_del_rcu(&tc->list);
8e0c9dac	4153	spin_unlock_irq(&tc->pool->lock);
c140e1c4	4154	synchronize_rcu();
991d9fa0	4155
17181fb7 MP	4156	thin_put(tc);
	4157	wait_for_completion(&tc->can_destroy);
	4158
991d9fa0 JT	4159	mutex_lock(&dm_thin_pool_table.mutex);
	4160
	4161	__pool_dec(tc->pool);
	4162	dm_pool_close_thin_device(tc->td);
	4163	dm_put_device(ti, tc->pool_dev);
2dd9c257 JT	4164	if (tc->origin_dev)
2dd9c257 JT	4165	dm_put_device(ti, tc->origin_dev);
991d9fa0 JT	4166	kfree(tc);
	4167
	4168	mutex_unlock(&dm_thin_pool_table.mutex);
	4169	}
	4170
	4171	/*
	4172	* Thin target parameters:
	4173	*
2dd9c257	4174	* <pool_dev> <dev_id> [origin_dev]
991d9fa0 JT	4175	*
	4176	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
	4177	* dev_id: the internal device identifier
2dd9c257	4178	* origin_dev: a device external to the pool that should act as the origin
67e2e2b2 JT	4179	*
	4180	* If the pool device has discards disabled, they get disabled for the thin
	4181	* device as well.
991d9fa0	4182	*/
86a3238c	4183	static int thin_ctr(struct dm_target ti, unsigned int argc, char *argv)
991d9fa0 JT	4184	{
	4185	int r;
	4186	struct thin_c *tc;
2dd9c257	4187	struct dm_dev pool_dev, origin_dev;
991d9fa0 JT	4188	struct mapped_device *pool_md;
	4189
	4190	mutex_lock(&dm_thin_pool_table.mutex);
	4191
2dd9c257	4192	if (argc != 2 && argc != 3) {
991d9fa0 JT	4193	ti->error = "Invalid argument count";
	4194	r = -EINVAL;
	4195	goto out_unlock;
	4196	}
	4197
	4198	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	4199	if (!tc) {
	4200	ti->error = "Out of memory";
	4201	r = -ENOMEM;
	4202	goto out_unlock;
	4203	}
583024d2	4204	tc->thin_md = dm_table_get_md(ti->table);
c140e1c4	4205	spin_lock_init(&tc->lock);
a374bb21	4206	INIT_LIST_HEAD(&tc->deferred_cells);
c140e1c4 MS	4207	bio_list_init(&tc->deferred_bio_list);
c140e1c4 MS	4208	bio_list_init(&tc->retry_on_resume_list);
67324ea1	4209	tc->sort_bio_list = RB_ROOT;
991d9fa0	4210
2dd9c257	4211	if (argc == 3) {
70de2cbd JCXF	4212	if (!strcmp(argv[0], argv[2])) {
	4213	ti->error = "Error setting origin device";
	4214	r = -EINVAL;
	4215	goto bad_origin_dev;
	4216	}
	4217
05bdb996	4218	r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev);
2dd9c257 JT	4219	if (r) {
	4220	ti->error = "Error opening origin device";
	4221	goto bad_origin_dev;
	4222	}
	4223	tc->origin_dev = origin_dev;
	4224	}
	4225
991d9fa0 JT	4226	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	4227	if (r) {
	4228	ti->error = "Error opening pool device";
	4229	goto bad_pool_dev;
	4230	}
	4231	tc->pool_dev = pool_dev;
	4232
	4233	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
	4234	ti->error = "Invalid device id";
	4235	r = -EINVAL;
	4236	goto bad_common;
	4237	}
	4238
	4239	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	4240	if (!pool_md) {
	4241	ti->error = "Couldn't get pool mapped device";
	4242	r = -EINVAL;
	4243	goto bad_common;
	4244	}
	4245
	4246	tc->pool = __pool_table_lookup(pool_md);
	4247	if (!tc->pool) {
	4248	ti->error = "Couldn't find pool object";
	4249	r = -EINVAL;
	4250	goto bad_pool_lookup;
	4251	}
	4252	__pool_inc(tc->pool);
	4253
e49e5829 JT	4254	if (get_pool_mode(tc->pool) == PM_FAIL) {
e49e5829 JT	4255	ti->error = "Couldn't open thin device, Pool is in fail mode";
1acacc07	4256	r = -EINVAL;
80e96c54	4257	goto bad_pool;
e49e5829 JT	4258	}
e49e5829 JT	4259
991d9fa0 JT	4260	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	4261	if (r) {
	4262	ti->error = "Couldn't open thin internal device";
80e96c54	4263	goto bad_pool;
991d9fa0 JT	4264	}
991d9fa0 JT	4265
542f9038 MS	4266	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
542f9038 MS	4267	if (r)
80e96c54	4268	goto bad;
542f9038	4269
55a62eef	4270	ti->num_flush_bios = 1;
9bbf5fee	4271	ti->limit_swap_bios = true;
16ad3d10	4272	ti->flush_supported = true;
a9251281	4273	ti->accounts_remapped_io = true;
30187e1d	4274	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
67e2e2b2 JT	4275
	4276	/* In case the pool supports discards, pass them on. */
	4277	if (tc->pool->pf.discard_enabled) {
0ac55489	4278	ti->discards_supported = true;
55a62eef	4279	ti->num_discard_bios = 1;
e2dd8aca	4280	ti->max_discard_granularity = true;
67e2e2b2	4281	}
991d9fa0	4282
991d9fa0 JT	4283	mutex_unlock(&dm_thin_pool_table.mutex);
991d9fa0 JT	4284
8e0c9dac	4285	spin_lock_irq(&tc->pool->lock);
80e96c54	4286	if (tc->pool->suspended) {
8e0c9dac	4287	spin_unlock_irq(&tc->pool->lock);
80e96c54 MS	4288	mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
	4289	ti->error = "Unable to activate thin device while pool is suspended";
	4290	r = -EINVAL;
	4291	goto bad;
	4292	}
22d4c291	4293	refcount_set(&tc->refcount, 1);
2b94e896	4294	init_completion(&tc->can_destroy);
c140e1c4	4295	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
8e0c9dac	4296	spin_unlock_irq(&tc->pool->lock);
c140e1c4 MS	4297	/*
	4298	* This synchronize_rcu() call is needed here otherwise we risk a
	4299	* wake_worker() call finding no bios to process (because the newly
	4300	* added tc isn't yet visible). So this reduces latency since we
	4301	* aren't then dependent on the periodic commit to wake_worker().
	4302	*/
	4303	synchronize_rcu();
	4304
80e96c54 MS	4305	dm_put(pool_md);
80e96c54 MS	4306
991d9fa0 JT	4307	return 0;
991d9fa0 JT	4308
80e96c54	4309	bad:
1acacc07	4310	dm_pool_close_thin_device(tc->td);
80e96c54	4311	bad_pool:
991d9fa0 JT	4312	__pool_dec(tc->pool);
	4313	bad_pool_lookup:
	4314	dm_put(pool_md);
	4315	bad_common:
	4316	dm_put_device(ti, tc->pool_dev);
	4317	bad_pool_dev:
2dd9c257 JT	4318	if (tc->origin_dev)
	4319	dm_put_device(ti, tc->origin_dev);
	4320	bad_origin_dev:
991d9fa0 JT	4321	kfree(tc);
	4322	out_unlock:
	4323	mutex_unlock(&dm_thin_pool_table.mutex);
	4324
	4325	return r;
	4326	}
	4327
7de3ee57	4328	static int thin_map(struct dm_target ti, struct bio bio)
991d9fa0	4329	{
4f024f37	4330	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
991d9fa0	4331
7de3ee57	4332	return thin_bio_map(ti, bio);
991d9fa0 JT	4333	}
991d9fa0 JT	4334
4e4cbee9 CH	4335	static int thin_endio(struct dm_target ti, struct bio bio,
4e4cbee9 CH	4336	blk_status_t *err)
eb2aa48d JT	4337	{
eb2aa48d JT	4338	unsigned long flags;
59c3d2c6	4339	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d	4340	struct list_head work;
a24c2569	4341	struct dm_thin_new_mapping m, tmp;
eb2aa48d JT	4342	struct pool *pool = h->tc->pool;
	4343
	4344	if (h->shared_read_entry) {
	4345	INIT_LIST_HEAD(&work);
44feb387	4346	dm_deferred_entry_dec(h->shared_read_entry, &work);
eb2aa48d JT	4347
	4348	spin_lock_irqsave(&pool->lock, flags);
	4349	list_for_each_entry_safe(m, tmp, &work, list) {
	4350	list_del(&m->list);
50f3c3ef	4351	__complete_mapping_preparation(m);
eb2aa48d JT	4352	}
	4353	spin_unlock_irqrestore(&pool->lock, flags);
	4354	}
	4355
104655fd JT	4356	if (h->all_io_entry) {
104655fd JT	4357	INIT_LIST_HEAD(&work);
44feb387	4358	dm_deferred_entry_dec(h->all_io_entry, &work);
563af186 JT	4359	if (!list_empty(&work)) {
	4360	spin_lock_irqsave(&pool->lock, flags);
	4361	list_for_each_entry_safe(m, tmp, &work, list)
daec338b	4362	list_add_tail(&m->list, &pool->prepared_discards);
563af186 JT	4363	spin_unlock_irqrestore(&pool->lock, flags);
	4364	wake_worker(pool);
	4365	}
104655fd JT	4366	}
104655fd JT	4367
34fbcf62 JT	4368	if (h->cell)
	4369	cell_defer_no_holder(h->tc, h->cell);
	4370
1be56909	4371	return DM_ENDIO_DONE;
eb2aa48d JT	4372	}
eb2aa48d JT	4373
738211f7	4374	static void thin_presuspend(struct dm_target *ti)
991d9fa0	4375	{
738211f7 JT	4376	struct thin_c *tc = ti->private;
738211f7 JT	4377
991d9fa0	4378	if (dm_noflush_suspending(ti))
738211f7 JT	4379	noflush_work(tc, do_noflush_start);
	4380	}
	4381
	4382	static void thin_postsuspend(struct dm_target *ti)
	4383	{
	4384	struct thin_c *tc = ti->private;
	4385
	4386	/*
	4387	* The dm_noflush_suspending flag has been cleared by now, so
	4388	* unfortunately we must always run this.
	4389	*/
	4390	noflush_work(tc, do_noflush_stop);
991d9fa0 JT	4391	}
991d9fa0 JT	4392
e5aea7b4 JT	4393	static int thin_preresume(struct dm_target *ti)
	4394	{
	4395	struct thin_c *tc = ti->private;
	4396
	4397	if (tc->origin_dev)
	4398	tc->origin_size = get_dev_size(tc->origin_dev->bdev);
	4399
	4400	return 0;
	4401	}
	4402
991d9fa0 JT	4403	/*
	4404	* <nr mapped sectors> <highest mapped sector>
	4405	*/
fd7c092e	4406	static void thin_status(struct dm_target *ti, status_type_t type,
86a3238c	4407	unsigned int status_flags, char *result, unsigned int maxlen)
991d9fa0 JT	4408	{
	4409	int r;
	4410	ssize_t sz = 0;
	4411	dm_block_t mapped, highest;
	4412	char buf[BDEVNAME_SIZE];
	4413	struct thin_c *tc = ti->private;
	4414
e49e5829 JT	4415	if (get_pool_mode(tc->pool) == PM_FAIL) {
e49e5829 JT	4416	DMEMIT("Fail");
fd7c092e	4417	return;
e49e5829 JT	4418	}
e49e5829 JT	4419
991d9fa0 JT	4420	if (!tc->td)
	4421	DMEMIT("-");
	4422	else {
	4423	switch (type) {
	4424	case STATUSTYPE_INFO:
	4425	r = dm_thin_get_mapped_count(tc->td, &mapped);
fd7c092e MP	4426	if (r) {
	4427	DMERR("dm_thin_get_mapped_count returned %d", r);
	4428	goto err;
	4429	}
991d9fa0 JT	4430
991d9fa0 JT	4431	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
fd7c092e MP	4432	if (r < 0) {
	4433	DMERR("dm_thin_get_highest_mapped_block returned %d", r);
	4434	goto err;
	4435	}
991d9fa0 JT	4436
	4437	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
	4438	if (r)
	4439	DMEMIT("%llu", ((highest + 1) *
	4440	tc->pool->sectors_per_block) - 1);
	4441	else
	4442	DMEMIT("-");
	4443	break;
	4444
	4445	case STATUSTYPE_TABLE:
	4446	DMEMIT("%s %lu",
	4447	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
	4448	(unsigned long) tc->dev_id);
2dd9c257 JT	4449	if (tc->origin_dev)
2dd9c257 JT	4450	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
991d9fa0	4451	break;
8ec45662 TS	4452
	4453	case STATUSTYPE_IMA:
	4454	*result = '\0';
	4455	break;
991d9fa0 JT	4456	}
	4457	}
	4458
fd7c092e MP	4459	return;
	4460
	4461	err:
	4462	DMEMIT("Error");
991d9fa0 JT	4463	}
	4464
	4465	static int thin_iterate_devices(struct dm_target *ti,
	4466	iterate_devices_callout_fn fn, void *data)
	4467	{
55f2b8bd	4468	sector_t blocks;
991d9fa0	4469	struct thin_c *tc = ti->private;
55f2b8bd	4470	struct pool *pool = tc->pool;
991d9fa0 JT	4471
	4472	/*
	4473	* We can't call dm_pool_get_data_dev_size() since that blocks. So
	4474	* we follow a more convoluted path through to the pool's target.
	4475	*/
55f2b8bd	4476	if (!pool->ti)
991d9fa0 JT	4477	return 0; /* nothing is bound */
991d9fa0 JT	4478
55f2b8bd MS	4479	blocks = pool->ti->len;
55f2b8bd MS	4480	(void) sector_div(blocks, pool->sectors_per_block);
991d9fa0	4481	if (blocks)
55f2b8bd	4482	return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
991d9fa0 JT	4483
	4484	return 0;
	4485	}
	4486
34fbcf62 JT	4487	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
	4488	{
	4489	struct thin_c *tc = ti->private;
	4490	struct pool *pool = tc->pool;
21607670	4491
ef6953fb MS	4492	if (pool->pf.discard_enabled) {
ef6953fb MS	4493	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
825d8bbd	4494	limits->max_hw_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
ef6953fb	4495	}
34fbcf62 JT	4496	}
34fbcf62 JT	4497
991d9fa0 JT	4498	static struct target_type thin_target = {
991d9fa0 JT	4499	.name = "thin",
e2dd8aca	4500	.version = {1, 23, 0},
991d9fa0 JT	4501	.module = THIS_MODULE,
	4502	.ctr = thin_ctr,
	4503	.dtr = thin_dtr,
	4504	.map = thin_map,
eb2aa48d	4505	.end_io = thin_endio,
e5aea7b4	4506	.preresume = thin_preresume,
738211f7	4507	.presuspend = thin_presuspend,
991d9fa0 JT	4508	.postsuspend = thin_postsuspend,
	4509	.status = thin_status,
	4510	.iterate_devices = thin_iterate_devices,
34fbcf62	4511	.io_hints = thin_io_hints,
991d9fa0 JT	4512	};
	4513
	4514	/*----------------------------------------------------------------*/
	4515
	4516	static int __init dm_thin_init(void)
	4517	{
7e6358d2	4518	int r = -ENOMEM;
991d9fa0 JT	4519
	4520	pool_table_init();
	4521
7e6358d2	4522	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	4523	if (!_new_mapping_cache)
	4524	return r;
	4525
991d9fa0 JT	4526	r = dm_register_target(&thin_target);
991d9fa0 JT	4527	if (r)
7e6358d2	4528	goto bad_new_mapping_cache;
991d9fa0 JT	4529
	4530	r = dm_register_target(&pool_target);
	4531	if (r)
7e6358d2	4532	goto bad_thin_target;
a24c2569	4533
a24c2569 MS	4534	return 0;
a24c2569 MS	4535
7e6358d2	4536	bad_thin_target:
a24c2569	4537	dm_unregister_target(&thin_target);
7e6358d2	4538	bad_new_mapping_cache:
7e6358d2	4539	kmem_cache_destroy(_new_mapping_cache);
991d9fa0 JT	4540
	4541	return r;
	4542	}
	4543
	4544	static void dm_thin_exit(void)
	4545	{
	4546	dm_unregister_target(&thin_target);
	4547	dm_unregister_target(&pool_target);
a24c2569	4548
a24c2569	4549	kmem_cache_destroy(_new_mapping_cache);
d5ffebdd MS	4550
d5ffebdd MS	4551	pool_table_exit();
991d9fa0 JT	4552	}
	4553
	4554	module_init(dm_thin_init);
	4555	module_exit(dm_thin_exit);
	4556
6a808034	4557	module_param_named(no_space_timeout, no_space_timeout_secs, uint, 0644);
80c57893 MS	4558	MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
80c57893 MS	4559
7cab8bf1	4560	MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
fa34e589	4561	MODULE_AUTHOR("Joe Thornber <[email protected]>");
991d9fa0	4562	MODULE_LICENSE("GPL");