/*
 * Copyright (C) 2001 Jens Axboe <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#include <trace/events/block.h>

/*
 * Test patch to inline a certain number of bi_io_vec's inside the bio
 * itself, to shrink a bio data allocation from two mempool calls to one
 */
#define BIO_INLINE_VECS		4

static mempool_t *bio_split_pool __read_mostly;

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */
#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
        BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set *fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

/*
 * Our slab pool management
 */
struct bio_slab {
        struct kmem_cache *slab;
        unsigned int slab_ref;
        unsigned int slab_size;
        char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static struct bio_slab *bio_slabs;
static unsigned int bio_slab_nr, bio_slab_max;

static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
        unsigned int sz = sizeof(struct bio) + extra_size;
        struct kmem_cache *slab = NULL;
        struct bio_slab *bslab, *new_bio_slabs;
        unsigned int new_bio_slab_max;
        unsigned int i, entry = -1;

        mutex_lock(&bio_slab_lock);

        i = 0;
        while (i < bio_slab_nr) {
                bslab = &bio_slabs[i];

                if (!bslab->slab && entry == -1)
                        entry = i;
                else if (bslab->slab_size == sz) {
                        slab = bslab->slab;
                        bslab->slab_ref++;
                        break;
                }
                i++;
        }

        if (slab)
                goto out_unlock;

        if (bio_slab_nr == bio_slab_max && entry == -1) {
                new_bio_slab_max = bio_slab_max << 1;
                new_bio_slabs = krealloc(bio_slabs,
                                         new_bio_slab_max * sizeof(struct bio_slab),
                                         GFP_KERNEL);
                if (!new_bio_slabs)
                        goto out_unlock;
                bio_slab_max = new_bio_slab_max;
                bio_slabs = new_bio_slabs;
        }
        if (entry == -1)
                entry = bio_slab_nr++;

        bslab = &bio_slabs[entry];

        snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
        slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
        if (!slab)
                goto out_unlock;

        printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
        bslab->slab = slab;
        bslab->slab_ref = 1;
        bslab->slab_size = sz;
out_unlock:
        mutex_unlock(&bio_slab_lock);
        return slab;
}

static void bio_put_slab(struct bio_set *bs)
{
        struct bio_slab *bslab = NULL;
        unsigned int i;

        mutex_lock(&bio_slab_lock);

        for (i = 0; i < bio_slab_nr; i++) {
                if (bs->bio_slab == bio_slabs[i].slab) {
                        bslab = &bio_slabs[i];
                        break;
                }
        }

        if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
                goto out;

        WARN_ON(!bslab->slab_ref);

        if (--bslab->slab_ref)
                goto out;

        kmem_cache_destroy(bslab->slab);
        bslab->slab = NULL;

out:
        mutex_unlock(&bio_slab_lock);
}

unsigned int bvec_nr_vecs(unsigned short idx)
{
        return bvec_slabs[idx].nr_vecs;
}

void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
{
        BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);

        if (idx == BIOVEC_MAX_IDX)
                mempool_free(bv, bs->bvec_pool);
        else {
                struct biovec_slab *bvs = bvec_slabs + idx;

                kmem_cache_free(bvs->slab, bv);
        }
}

struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
                              struct bio_set *bs)
{
        struct bio_vec *bvl;

        /*
         * see comment near bvec_array define!
         */
        switch (nr) {
        case 1:
                *idx = 0;
                break;
        case 2 ... 4:
                *idx = 1;
                break;
        case 5 ... 16:
                *idx = 2;
                break;
        case 17 ... 64:
                *idx = 3;
                break;
        case 65 ... 128:
                *idx = 4;
                break;
        case 129 ... BIO_MAX_PAGES:
                *idx = 5;
                break;
        default:
                return NULL;
        }

        /*
         * idx now points to the pool we want to allocate from. only the
         * 1-vec entry pool is mempool backed.
         */
        if (*idx == BIOVEC_MAX_IDX) {
fallback:
                bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);

                /*
                 * Make this allocation restricted and don't dump info on
                 * allocation failures, since we'll fallback to the mempool
                 * in case of failure.
                 */
                __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

                /*
                 * Try a slab allocation. If this fails and __GFP_WAIT
                 * is set, retry with the 1-entry mempool
                 */
                bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
                if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
                        *idx = BIOVEC_MAX_IDX;
                        goto fallback;
                }
        }

        return bvl;
}

static void __bio_free(struct bio *bio)
{
        bio_disassociate_task(bio);

        if (bio_integrity(bio))
                bio_integrity_free(bio);
}

static void bio_free(struct bio *bio)
{
        struct bio_set *bs = bio->bi_pool;
        void *p;

        __bio_free(bio);

        if (bs) {
                if (bio_has_allocated_vec(bio))
                        bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));

                /*
                 * If we have front padding, adjust the bio pointer before freeing
                 */
                p = bio;
                p -= bs->front_pad;

                mempool_free(p, bs->bio_pool);
        } else {
                /* Bio was allocated by bio_kmalloc() */
                kfree(bio);
        }
}

void bio_init(struct bio *bio)
{
        memset(bio, 0, sizeof(*bio));
        bio->bi_flags = 1 << BIO_UPTODATE;
        atomic_set(&bio->bi_cnt, 1);
}
EXPORT_SYMBOL(bio_init);

/**
 * bio_reset - reinitialize a bio
 * @bio: bio to reset
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned by bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio)
{
        unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

        __bio_free(bio);

        memset(bio, 0, BIO_RESET_BYTES);
        bio->bi_flags = flags|(1 << BIO_UPTODATE);
}
EXPORT_SYMBOL(bio_reset);
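
/*
 * Usage sketch (illustrative only, not part of this file): a driver that
 * embeds a bio in a private structure can recycle it with bio_reset()
 * instead of freeing and reallocating. bi_max_vecs and bi_io_vec are among
 * the preserved fields; the target, direction and end_io must be set up
 * again. The my_request/my_end_io names are hypothetical.
 *
 *	static void my_resubmit(struct my_request *rq, sector_t sector)
 *	{
 *		bio_reset(&rq->bio);		// back to freshly-allocated state
 *		rq->bio.bi_bdev = rq->bdev;
 *		rq->bio.bi_sector = sector;
 *		rq->bio.bi_end_io = my_end_io;
 *		bio_add_page(&rq->bio, rq->page, PAGE_SIZE, 0);
 *		submit_bio(WRITE, &rq->bio);
 *	}
 */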
299 | ||
300 | /** | |
301 | * bio_alloc_bioset - allocate a bio for I/O | |
302 | * @gfp_mask: the GFP_ mask given to the slab allocator | |
303 | * @nr_iovecs: number of iovecs to pre-allocate | |
304 | * @bs: the bio_set to allocate from. | |
305 | * | |
306 | * Description: | |
307 | * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is | |
308 | * backed by the @bs's mempool. | |
309 | * | |
310 | * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be | |
311 | * able to allocate a bio. This is due to the mempool guarantees. To make this | |
312 | * work, callers must never allocate more than 1 bio at a time from this pool. | |
313 | * Callers that need to allocate more than 1 bio must always submit the | |
314 | * previously allocated bio for IO before attempting to allocate a new one. | |
315 | * Failure to do so can cause deadlocks under memory pressure. | |
316 | * | |
317 | * RETURNS: | |
318 | * Pointer to new bio on success, NULL on failure. | |
319 | */ | |
320 | struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) | |
321 | { | |
322 | unsigned front_pad; | |
323 | unsigned inline_vecs; | |
324 | unsigned long idx = BIO_POOL_NONE; | |
325 | struct bio_vec *bvl = NULL; | |
326 | struct bio *bio; | |
327 | void *p; | |
328 | ||
329 | if (!bs) { | |
330 | if (nr_iovecs > UIO_MAXIOV) | |
331 | return NULL; | |
332 | ||
333 | p = kmalloc(sizeof(struct bio) + | |
334 | nr_iovecs * sizeof(struct bio_vec), | |
335 | gfp_mask); | |
336 | front_pad = 0; | |
337 | inline_vecs = nr_iovecs; | |
338 | } else { | |
339 | p = mempool_alloc(bs->bio_pool, gfp_mask); | |
340 | front_pad = bs->front_pad; | |
341 | inline_vecs = BIO_INLINE_VECS; | |
342 | } | |
343 | ||
344 | if (unlikely(!p)) | |
345 | return NULL; | |
346 | ||
347 | bio = p + front_pad; | |
348 | bio_init(bio); | |
349 | ||
350 | if (nr_iovecs > inline_vecs) { | |
351 | bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); | |
352 | if (unlikely(!bvl)) | |
353 | goto err_free; | |
354 | } else if (nr_iovecs) { | |
355 | bvl = bio->bi_inline_vecs; | |
356 | } | |
357 | ||
358 | bio->bi_pool = bs; | |
359 | bio->bi_flags |= idx << BIO_POOL_OFFSET; | |
360 | bio->bi_max_vecs = nr_iovecs; | |
361 | bio->bi_io_vec = bvl; | |
362 | return bio; | |
363 | ||
364 | err_free: | |
365 | mempool_free(p, bs->bio_pool); | |
366 | return NULL; | |
367 | } | |
368 | EXPORT_SYMBOL(bio_alloc_bioset); | |
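
/*
 * Usage sketch (illustrative only): allocating from a private bio_set with
 * __GFP_WAIT cannot fail, but per the rule above each bio must be submitted
 * before the next one is allocated from the same pool. my_write_pages and
 * my_end_io are hypothetical names.
 *
 *	static void my_write_pages(struct block_device *bdev, sector_t sector,
 *				   struct page **pages, int nr,
 *				   struct bio_set *bs)
 *	{
 *		while (nr) {
 *			struct bio *bio;
 *
 *			bio = bio_alloc_bioset(GFP_NOIO,
 *					       min(nr, BIO_MAX_PAGES), bs);
 *			bio->bi_bdev = bdev;
 *			bio->bi_sector = sector;
 *			bio->bi_end_io = my_end_io;
 *			while (nr && bio_add_page(bio, *pages, PAGE_SIZE, 0)) {
 *				pages++;
 *				nr--;
 *				sector += PAGE_SIZE >> 9;
 *			}
 *			// submit before allocating the next bio, or risk
 *			// deadlocking on the mempool under memory pressure
 *			submit_bio(WRITE, bio);
 *		}
 *	}
 */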

void zero_fill_bio(struct bio *bio)
{
        unsigned long flags;
        struct bio_vec *bv;
        int i;

        bio_for_each_segment(bv, bio, i) {
                char *data = bvec_kmap_irq(bv, &flags);
                memset(data, 0, bv->bv_len);
                flush_dcache_page(bv->bv_page);
                bvec_kunmap_irq(data, &flags);
        }
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio: bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
        BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

        /*
         * last put frees it
         */
        if (atomic_dec_and_test(&bio->bi_cnt))
                bio_free(bio);
}
EXPORT_SYMBOL(bio_put);

inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
        if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
                blk_recount_segments(q, bio);

        return bio->bi_phys_segments;
}
EXPORT_SYMBOL(bio_phys_segments);

/**
 * __bio_clone - clone a bio
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
        memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
                bio_src->bi_max_vecs * sizeof(struct bio_vec));

        /*
         * most users will be overriding ->bi_bdev with a new target,
         * so we don't set nor calculate new physical/hw segment counts here
         */
        bio->bi_sector = bio_src->bi_sector;
        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_flags |= 1 << BIO_CLONED;
        bio->bi_rw = bio_src->bi_rw;
        bio->bi_vcnt = bio_src->bi_vcnt;
        bio->bi_size = bio_src->bi_size;
        bio->bi_idx = bio_src->bi_idx;
}
EXPORT_SYMBOL(__bio_clone);

/**
 * bio_clone_bioset - clone a bio
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 * @bs: bio_set to allocate from
 *
 * Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
                             struct bio_set *bs)
{
        struct bio *b;

        b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
        if (!b)
                return NULL;

        __bio_clone(b, bio);

        if (bio_integrity(bio)) {
                int ret;

                ret = bio_integrity_clone(b, bio, gfp_mask);

                if (ret < 0) {
                        bio_put(b);
                        return NULL;
                }
        }

        return b;
}
EXPORT_SYMBOL(bio_clone_bioset);

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);
        int nr_pages;

        nr_pages = min_t(unsigned,
                     queue_max_segments(q),
                     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);

        return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
}
EXPORT_SYMBOL(bio_get_nr_vecs);

static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                          *page, unsigned int len, unsigned int offset,
                          unsigned short max_sectors)
{
        int retried_segments = 0;
        struct bio_vec *bvec;

        /*
         * cloned bio must not modify vec list
         */
        if (unlikely(bio_flagged(bio, BIO_CLONED)))
                return 0;

        if (((bio->bi_size + len) >> 9) > max_sectors)
                return 0;

        /*
         * For filesystems with a blocksize smaller than the pagesize
         * we will often be called with the same page as last time and
         * a consecutive offset.  Optimize this special case.
         */
        if (bio->bi_vcnt > 0) {
                struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

                if (page == prev->bv_page &&
                    offset == prev->bv_offset + prev->bv_len) {
                        unsigned int prev_bv_len = prev->bv_len;
                        prev->bv_len += len;

                        if (q->merge_bvec_fn) {
                                struct bvec_merge_data bvm = {
                                        /* prev_bvec is already charged in
                                           bi_size, discharge it in order to
                                           simulate merging updated prev_bvec
                                           as new bvec. */
                                        .bi_bdev = bio->bi_bdev,
                                        .bi_sector = bio->bi_sector,
                                        .bi_size = bio->bi_size - prev_bv_len,
                                        .bi_rw = bio->bi_rw,
                                };

                                if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
                                        prev->bv_len -= len;
                                        return 0;
                                }
                        }

                        goto done;
                }
        }

        if (bio->bi_vcnt >= bio->bi_max_vecs)
                return 0;

        /*
         * we might lose a segment or two here, but rather that than
         * make this too complex.
         */

        while (bio->bi_phys_segments >= queue_max_segments(q)) {

                if (retried_segments)
                        return 0;

                retried_segments = 1;
                blk_recount_segments(q, bio);
        }

        /*
         * setup the new entry, we might clear it again later if we
         * cannot add the page
         */
        bvec = &bio->bi_io_vec[bio->bi_vcnt];
        bvec->bv_page = page;
        bvec->bv_len = len;
        bvec->bv_offset = offset;

        /*
         * if queue has other restrictions (eg varying max sector size
         * depending on offset), it can specify a merge_bvec_fn in the
         * queue to get further control
         */
        if (q->merge_bvec_fn) {
                struct bvec_merge_data bvm = {
                        .bi_bdev = bio->bi_bdev,
                        .bi_sector = bio->bi_sector,
                        .bi_size = bio->bi_size,
                        .bi_rw = bio->bi_rw,
                };

                /*
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
                if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
                        bvec->bv_page = NULL;
                        bvec->bv_len = 0;
                        bvec->bv_offset = 0;
                        return 0;
                }
        }

        /* If we may be able to merge these biovecs, force a recount */
        if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
                bio->bi_flags &= ~(1 << BIO_SEG_VALID);

        bio->bi_vcnt++;
        bio->bi_phys_segments++;
done:
        bio->bi_size += len;
        return len;
}

/**
 * bio_add_pc_page - attempt to add page to bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 *
 * This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
                    unsigned int len, unsigned int offset)
{
        return __bio_add_page(q, bio, page, len, offset,
                              queue_max_hw_sectors(q));
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
                 unsigned int offset)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
}
EXPORT_SYMBOL(bio_add_page);
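
/*
 * Usage sketch (illustrative only): bio_get_nr_vecs() gives a hint for
 * sizing the bio, and the return value of bio_add_page() must be checked,
 * since the queue may refuse a page even when bi_max_vecs has room.
 *
 *	int nr = min(nr_pages, bio_get_nr_vecs(bdev));
 *	struct bio *bio = bio_alloc(GFP_NOIO, nr);
 *	int i;
 *
 *	bio->bi_bdev = bdev;
 *	bio->bi_sector = sector;
 *	for (i = 0; i < nr_pages; i++)
 *		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
 *			break;	// bio is full, submit and start a new one
 */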

struct bio_map_data {
        struct bio_vec *iovecs;
        struct sg_iovec *sgvecs;
        int nr_sgvecs;
        int is_our_pages;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
                             struct sg_iovec *iov, int iov_count,
                             int is_our_pages)
{
        memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
        memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
        bmd->nr_sgvecs = iov_count;
        bmd->is_our_pages = is_our_pages;
        bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
        kfree(bmd->iovecs);
        kfree(bmd->sgvecs);
        kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs,
                                               unsigned int iov_count,
                                               gfp_t gfp_mask)
{
        struct bio_map_data *bmd;

        if (iov_count > UIO_MAXIOV)
                return NULL;

        bmd = kmalloc(sizeof(*bmd), gfp_mask);
        if (!bmd)
                return NULL;

        bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
        if (!bmd->iovecs) {
                kfree(bmd);
                return NULL;
        }

        bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
        if (bmd->sgvecs)
                return bmd;

        kfree(bmd->iovecs);
        kfree(bmd);
        return NULL;
}

static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
                          struct sg_iovec *iov, int iov_count,
                          int to_user, int from_user, int do_free_page)
{
        int ret = 0, i;
        struct bio_vec *bvec;
        int iov_idx = 0;
        unsigned int iov_off = 0;

        __bio_for_each_segment(bvec, bio, i, 0) {
                char *bv_addr = page_address(bvec->bv_page);
                unsigned int bv_len = iovecs[i].bv_len;

                while (bv_len && iov_idx < iov_count) {
                        unsigned int bytes;
                        char __user *iov_addr;

                        bytes = min_t(unsigned int,
                                      iov[iov_idx].iov_len - iov_off, bv_len);
                        iov_addr = iov[iov_idx].iov_base + iov_off;

                        if (!ret) {
                                if (to_user)
                                        ret = copy_to_user(iov_addr, bv_addr,
                                                           bytes);

                                if (from_user)
                                        ret = copy_from_user(bv_addr, iov_addr,
                                                             bytes);

                                if (ret)
                                        ret = -EFAULT;
                        }

                        bv_len -= bytes;
                        bv_addr += bytes;
                        iov_addr += bytes;
                        iov_off += bytes;

                        if (iov[iov_idx].iov_len == iov_off) {
                                iov_idx++;
                                iov_off = 0;
                        }
                }

                if (do_free_page)
                        __free_page(bvec->bv_page);
        }

        return ret;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
        struct bio_map_data *bmd = bio->bi_private;
        int ret = 0;

        if (!bio_flagged(bio, BIO_NULL_MAPPED))
                ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
                                     bmd->nr_sgvecs, bio_data_dir(bio) == READ,
                                     0, bmd->is_our_pages);
        bio_free_map_data(bmd);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL(bio_uncopy_user);

/**
 * bio_copy_user_iov - copy user data to bio
 * @q: destination block queue
 * @map_data: pointer to the rq_map_data holding pages (if necessary)
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user_iov(struct request_queue *q,
                              struct rq_map_data *map_data,
                              struct sg_iovec *iov, int iov_count,
                              int write_to_vm, gfp_t gfp_mask)
{
        struct bio_map_data *bmd;
        struct bio_vec *bvec;
        struct page *page;
        struct bio *bio;
        int i, ret;
        int nr_pages = 0;
        unsigned int len = 0;
        unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;

        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr;
                unsigned long end;
                unsigned long start;

                uaddr = (unsigned long)iov[i].iov_base;
                end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                start = uaddr >> PAGE_SHIFT;

                /*
                 * Overflow, abort
                 */
                if (end < start)
                        return ERR_PTR(-EINVAL);

                nr_pages += end - start;
                len += iov[i].iov_len;
        }

        if (offset)
                nr_pages++;

        bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
        if (!bmd)
                return ERR_PTR(-ENOMEM);

        ret = -ENOMEM;
        bio = bio_kmalloc(gfp_mask, nr_pages);
        if (!bio)
                goto out_bmd;

        if (!write_to_vm)
                bio->bi_rw |= REQ_WRITE;

        ret = 0;

        if (map_data) {
                nr_pages = 1 << map_data->page_order;
                i = map_data->offset / PAGE_SIZE;
        }
        while (len) {
                unsigned int bytes = PAGE_SIZE;

                bytes -= offset;

                if (bytes > len)
                        bytes = len;

                if (map_data) {
                        if (i == map_data->nr_entries * nr_pages) {
                                ret = -ENOMEM;
                                break;
                        }

                        page = map_data->pages[i / nr_pages];
                        page += (i % nr_pages);

                        i++;
                } else {
                        page = alloc_page(q->bounce_gfp | gfp_mask);
                        if (!page) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
                        break;

                len -= bytes;
                offset = 0;
        }

        if (ret)
                goto cleanup;

        /*
         * success
         */
        if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
            (map_data && map_data->from_user)) {
                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
                if (ret)
                        goto cleanup;
        }

        bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
        return bio;
cleanup:
        if (!map_data)
                bio_for_each_segment(bvec, bio, i)
                        __free_page(bvec->bv_page);

        bio_put(bio);
out_bmd:
        bio_free_map_data(bmd);
        return ERR_PTR(ret);
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @map_data: pointer to the rq_map_data holding pages (if necessary)
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
                          unsigned long uaddr, unsigned int len,
                          int write_to_vm, gfp_t gfp_mask)
{
        struct sg_iovec iov;

        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;

        return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_copy_user);
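
/*
 * Usage sketch (illustrative only): the bounce-buffer pattern. For a WRITE
 * the user data is copied into kernel pages up front; for a READ it is
 * copied back to user space by bio_uncopy_user() after completion.
 *
 *	struct bio *bio;
 *
 *	bio = bio_copy_user(q, NULL, uaddr, len, reading, GFP_KERNEL);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	// ... submit bio and wait for completion ...
 *	ret = bio_uncopy_user(bio);	// copies back and frees the pages
 */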

static struct bio *__bio_map_user_iov(struct request_queue *q,
                                      struct block_device *bdev,
                                      struct sg_iovec *iov, int iov_count,
                                      int write_to_vm, gfp_t gfp_mask)
{
        int i, j;
        int nr_pages = 0;
        struct page **pages;
        struct bio *bio;
        int cur_page = 0;
        int ret, offset;

        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr = (unsigned long)iov[i].iov_base;
                unsigned long len = iov[i].iov_len;
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;

                /*
                 * Overflow, abort
                 */
                if (end < start)
                        return ERR_PTR(-EINVAL);

                nr_pages += end - start;
                /*
                 * buffer must be aligned to at least hardsector size for now
                 */
                if (uaddr & queue_dma_alignment(q))
                        return ERR_PTR(-EINVAL);
        }

        if (!nr_pages)
                return ERR_PTR(-EINVAL);

        bio = bio_kmalloc(gfp_mask, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);

        ret = -ENOMEM;
        pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
        if (!pages)
                goto out;

        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr = (unsigned long)iov[i].iov_base;
                unsigned long len = iov[i].iov_len;
                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                unsigned long start = uaddr >> PAGE_SHIFT;
                const int local_nr_pages = end - start;
                const int page_limit = cur_page + local_nr_pages;

                ret = get_user_pages_fast(uaddr, local_nr_pages,
                                          write_to_vm, &pages[cur_page]);
                if (ret < local_nr_pages) {
                        ret = -EFAULT;
                        goto out_unmap;
                }

                offset = uaddr & ~PAGE_MASK;
                for (j = cur_page; j < page_limit; j++) {
                        unsigned int bytes = PAGE_SIZE - offset;

                        if (len <= 0)
                                break;

                        if (bytes > len)
                                bytes = len;

                        /*
                         * sorry...
                         */
                        if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
                                            bytes)
                                break;

                        len -= bytes;
                        offset = 0;
                }

                cur_page = j;
                /*
                 * release the pages we didn't map into the bio, if any
                 */
                while (j < page_limit)
                        page_cache_release(pages[j++]);
        }

        kfree(pages);

        /*
         * set data direction, and check if mapped pages need bouncing
         */
        if (!write_to_vm)
                bio->bi_rw |= REQ_WRITE;

        bio->bi_bdev = bdev;
        bio->bi_flags |= (1 << BIO_USER_MAPPED);
        return bio;

out_unmap:
        for (i = 0; i < nr_pages; i++) {
                if (!pages[i])
                        break;
                page_cache_release(pages[i]);
        }
out:
        kfree(pages);
        bio_put(bio);
        return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
                         unsigned long uaddr, unsigned int len, int write_to_vm,
                         gfp_t gfp_mask)
{
        struct sg_iovec iov;

        iov.iov_base = (void __user *)uaddr;
        iov.iov_len = len;

        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_map_user);
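
/*
 * Usage sketch (illustrative only): the zero-copy counterpart to
 * bio_copy_user(). The user pages are pinned, not copied, so the buffer
 * must satisfy queue_dma_alignment() and must stay valid until
 * bio_unmap_user() is called after completion.
 *
 *	bio = bio_map_user(q, bdev, uaddr, len, reading, GFP_KERNEL);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	// ... submit bio and wait for completion ...
 *	bio_unmap_user(bio);	// dirties (on READ) and releases the pages
 */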

/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
                             struct sg_iovec *iov, int iov_count,
                             int write_to_vm, gfp_t gfp_mask)
{
        struct bio *bio;

        bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
                                 gfp_mask);
        if (IS_ERR(bio))
                return bio;

        /*
         * subtle -- if __bio_map_user() ended up bouncing a bio,
         * it would normally disappear when its bi_end_io is run.
         * however, we need it for the unmap, so grab an extra
         * reference to it
         */
        bio_get(bio);

        return bio;
}

static void __bio_unmap_user(struct bio *bio)
{
        struct bio_vec *bvec;
        int i;

        /*
         * make sure we dirty pages we wrote to
         */
        __bio_for_each_segment(bvec, bio, i, 0) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);

                page_cache_release(bvec->bv_page);
        }

        bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called from
 * process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
        __bio_unmap_user(bio);
        bio_put(bio);
}
EXPORT_SYMBOL(bio_unmap_user);

static void bio_map_kern_endio(struct bio *bio, int err)
{
        bio_put(bio);
}

static struct bio *__bio_map_kern(struct request_queue *q, void *data,
                                  unsigned int len, gfp_t gfp_mask)
{
        unsigned long kaddr = (unsigned long)data;
        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long start = kaddr >> PAGE_SHIFT;
        const int nr_pages = end - start;
        int offset, i;
        struct bio *bio;

        bio = bio_kmalloc(gfp_mask, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);

        offset = offset_in_page(kaddr);
        for (i = 0; i < nr_pages; i++) {
                unsigned int bytes = PAGE_SIZE - offset;

                if (len <= 0)
                        break;

                if (bytes > len)
                        bytes = len;

                if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
                                    offset) < bytes)
                        break;

                data += bytes;
                len -= bytes;
                offset = 0;
        }

        bio->bi_end_io = bio_map_kern_endio;
        return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
                         gfp_t gfp_mask)
{
        struct bio *bio;

        bio = __bio_map_kern(q, data, len, gfp_mask);
        if (IS_ERR(bio))
                return bio;

        if (bio->bi_size == len)
                return bio;

        /*
         * Don't support partial mappings.
         */
        bio_put(bio);
        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(bio_map_kern);
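
/*
 * Usage sketch (illustrative only): bio_map_kern() requires a buffer that
 * is addressable with virt_to_page(), i.e. kmalloc()ed memory, not
 * vmalloc() or highmem. For buffers the device cannot reach directly,
 * bio_copy_kern() below bounces the data instead.
 *
 *	void *buf = kmalloc(len, GFP_KERNEL);
 *	struct bio *bio = bio_map_kern(q, buf, len, GFP_KERNEL);
 *
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 */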

static void bio_copy_kern_endio(struct bio *bio, int err)
{
        struct bio_vec *bvec;
        const int read = bio_data_dir(bio) == READ;
        struct bio_map_data *bmd = bio->bi_private;
        int i;
        char *p = bmd->sgvecs[0].iov_base;

        __bio_for_each_segment(bvec, bio, i, 0) {
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;

                if (read)
                        memcpy(p, addr, len);

                __free_page(bvec->bv_page);
                p += len;
        }

        bio_free_map_data(bmd);
        bio_put(bio);
}

/**
 * bio_copy_kern - copy kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to copy
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio and page allocation
 * @reading: data direction is READ
 *
 * copy the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
                          gfp_t gfp_mask, int reading)
{
        struct bio *bio;
        struct bio_vec *bvec;
        int i;

        bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
        if (IS_ERR(bio))
                return bio;

        if (!reading) {
                void *p = data;

                bio_for_each_segment(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);

                        memcpy(addr, p, bvec->bv_len);
                        p += bvec->bv_len;
                }
        }

        bio->bi_end_io = bio_copy_kern_endio;

        return bio;
}
EXPORT_SYMBOL(bio_copy_kern);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.  If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */
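
/*
 * Protocol sketch (illustrative only): a direct-IO read path using these
 * helpers looks roughly like this.
 *
 *	bio_set_pages_dirty(bio);	// before submission, process context
 *	submit_bio(READ, bio);
 *	...
 *	// in the bi_end_io handler (may run in interrupt context):
 *	bio_check_pages_dirty(bio);	// re-dirties clean pages via workqueue
 */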

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
        }
}

static void bio_release_pages(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (page)
                        put_page(page);
        }
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
        unsigned long flags;
        struct bio *bio;

        spin_lock_irqsave(&bio_dirty_lock, flags);
        bio = bio_dirty_list;
        bio_dirty_list = NULL;
        spin_unlock_irqrestore(&bio_dirty_lock, flags);

        while (bio) {
                struct bio *next = bio->bi_private;

                bio_set_pages_dirty(bio);
                bio_release_pages(bio);
                bio_put(bio);
                bio = next;
        }
}

void bio_check_pages_dirty(struct bio *bio)
{
        struct bio_vec *bvec = bio->bi_io_vec;
        int nr_clean_pages = 0;
        int i;

        for (i = 0; i < bio->bi_vcnt; i++) {
                struct page *page = bvec[i].bv_page;

                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
                        bvec[i].bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
        }

        if (nr_clean_pages) {
                unsigned long flags;

                spin_lock_irqsave(&bio_dirty_lock, flags);
                bio->bi_private = bio_dirty_list;
                bio_dirty_list = bio;
                spin_unlock_irqrestore(&bio_dirty_lock, flags);
                schedule_work(&bio_dirty_work);
        } else {
                bio_put(bio);
        }
}

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi)
{
        int i;
        struct bio_vec *bvec;

        bio_for_each_segment(bvec, bi, i)
                flush_dcache_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif

/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 * @error: error, if any
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the
 *   preferred way to end I/O on a bio, it takes care of clearing
 *   BIO_UPTODATE on error. @error is 0 on success, and one of the
 *   established -Exxxx (-EIO, for instance) error values in case
 *   something went wrong. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io
 *   function.
 **/
void bio_endio(struct bio *bio, int error)
{
        if (error)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;

        if (bio->bi_end_io)
                bio->bi_end_io(bio, error);
}
EXPORT_SYMBOL(bio_endio);
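
/*
 * Usage sketch (illustrative only): a typical completion handler paired
 * with bio_endio(). The my_request/done names are hypothetical.
 *
 *	static void my_end_io(struct bio *bio, int error)
 *	{
 *		struct my_request *rq = bio->bi_private;
 *
 *		// error is 0 on success or a negative errno; BIO_UPTODATE
 *		// has already been cleared by bio_endio() on failure
 *		complete(&rq->done);
 *		bio_put(bio);
 *	}
 */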

void bio_pair_release(struct bio_pair *bp)
{
        if (atomic_dec_and_test(&bp->cnt)) {
                struct bio *master = bp->bio1.bi_private;

                bio_endio(master, bp->error);
                mempool_free(bp, bp->bio2.bi_private);
        }
}
EXPORT_SYMBOL(bio_pair_release);

static void bio_pair_end_1(struct bio *bi, int err)
{
        struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

        if (err)
                bp->error = err;

        bio_pair_release(bp);
}

static void bio_pair_end_2(struct bio *bi, int err)
{
        struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

        if (err)
                bp->error = err;

        bio_pair_release(bp);
}

/*
 * split a bio - only worry about a bio with a single page in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, int first_sectors)
{
        struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);

        if (!bp)
                return bp;

        trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                          bi->bi_sector + first_sectors);

        BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
        BUG_ON(bi->bi_idx != 0);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
        bp->bio2 = *bi;
        bp->bio2.bi_sector += first_sectors;
        bp->bio2.bi_size -= first_sectors << 9;
        bp->bio1.bi_size = first_sectors << 9;

        if (bi->bi_vcnt != 0) {
                bp->bv1 = bi->bi_io_vec[0];
                bp->bv2 = bi->bi_io_vec[0];

                if (bio_is_rw(bi)) {
                        bp->bv2.bv_offset += first_sectors << 9;
                        bp->bv2.bv_len -= first_sectors << 9;
                        bp->bv1.bv_len = first_sectors << 9;
                }

                bp->bio1.bi_io_vec = &bp->bv1;
                bp->bio2.bi_io_vec = &bp->bv2;

                bp->bio1.bi_max_vecs = 1;
                bp->bio2.bi_max_vecs = 1;
        }

        bp->bio1.bi_end_io = bio_pair_end_1;
        bp->bio2.bi_end_io = bio_pair_end_2;

        bp->bio1.bi_private = bi;
        bp->bio2.bi_private = bio_split_pool;

        if (bio_integrity(bi))
                bio_integrity_split(bi, bp, first_sectors);

        return bp;
}
EXPORT_SYMBOL(bio_split);
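
/*
 * Usage sketch (illustrative only): splitting a single-page bio that
 * straddles a boundary, e.g. in a striping driver's make_request path.
 * Both halves complete into the original bio via the bio_pair end_io
 * hooks above.
 *
 *	if (sectors_to_boundary < bio_sectors(bio)) {
 *		struct bio_pair *bp = bio_split(bio, sectors_to_boundary);
 *
 *		generic_make_request(&bp->bio1);
 *		generic_make_request(&bp->bio2);
 *		bio_pair_release(bp);	// drop the third reference
 *	}
 */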

/**
 * bio_sector_offset - Find hardware sector offset in bio
 * @bio: bio to inspect
 * @index: bio_vec index
 * @offset: offset in bv_page
 *
 * Return the number of hardware sectors between beginning of bio
 * and an end point indicated by a bio_vec index and an offset
 * within that vector's page.
 */
sector_t bio_sector_offset(struct bio *bio, unsigned short index,
                           unsigned int offset)
{
        unsigned int sector_sz;
        struct bio_vec *bv;
        sector_t sectors;
        int i;

        sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
        sectors = 0;

        if (index >= bio->bi_idx)
                index = bio->bi_vcnt - 1;

        __bio_for_each_segment(bv, bio, i, 0) {
                if (i == index) {
                        if (offset > bv->bv_offset)
                                sectors += (offset - bv->bv_offset) / sector_sz;
                        break;
                }

                sectors += bv->bv_len / sector_sz;
        }

        return sectors;
}
EXPORT_SYMBOL(bio_sector_offset);

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries)
{
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;

        bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
        if (!bs->bvec_pool)
                return -ENOMEM;

        return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
        mempool_destroy(bs->bvec_pool);
}

void bioset_free(struct bio_set *bs)
{
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);

        bioset_integrity_free(bs);
        biovec_free_pools(bs);
        bio_put_slab(bs);

        kfree(bs);
}
EXPORT_SYMBOL(bioset_free);

/**
 * bioset_create  - Create a bio_set
 * @pool_size: Number of bio and bio_vecs to cache in the mempool
 * @front_pad: Number of bytes to allocate in front of the returned bio
 *
 * Description:
 *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
 *    to ask for a number of bytes to be allocated in front of the bio.
 *    Front pad allocation is useful for embedding the bio inside
 *    another structure, to avoid allocating extra data to go with the bio.
 *    Note that the bio must be embedded at the END of that structure always,
 *    or things will break badly.
 */
struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
{
        unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
        struct bio_set *bs;

        bs = kzalloc(sizeof(*bs), GFP_KERNEL);
        if (!bs)
                return NULL;

        bs->front_pad = front_pad;

        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab) {
                kfree(bs);
                return NULL;
        }

        bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
        if (!bs->bio_pool)
                goto bad;

        if (!biovec_create_pools(bs, pool_size))
                return bs;

bad:
        bioset_free(bs);
        return NULL;
}
EXPORT_SYMBOL(bioset_create);
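
/*
 * Usage sketch (illustrative only): using front_pad to co-allocate a
 * private structure with every bio from the set. The my_io struct below
 * is hypothetical.
 *
 *	struct my_io {
 *		struct my_dev	*dev;
 *		struct bio	bio;	// must be the last member
 *	};
 *
 *	bs = bioset_create(64, offsetof(struct my_io, bio));
 *	...
 *	bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, bs);
 *	io = container_of(bio, struct my_io, bio);
 */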

#ifdef CONFIG_BLK_CGROUP
/**
 * bio_associate_current - associate a bio with %current
 * @bio: target bio
 *
 * Associate @bio with %current if it hasn't been associated yet.  Block
 * layer will treat @bio as if it were issued by %current no matter which
 * task actually issues it.
 *
 * This function takes an extra reference of @task's io_context and blkcg
 * which will be put when @bio is released.  The caller must own @bio,
 * ensure %current->io_context exists, and is responsible for synchronizing
 * calls to this function.
 */
int bio_associate_current(struct bio *bio)
{
        struct io_context *ioc;
        struct cgroup_subsys_state *css;

        if (bio->bi_ioc)
                return -EBUSY;

        ioc = current->io_context;
        if (!ioc)
                return -ENOENT;

        /* acquire active ref on @ioc and associate */
        get_io_context_active(ioc);
        bio->bi_ioc = ioc;

        /* associate blkcg if exists */
        rcu_read_lock();
        css = task_subsys_state(current, blkio_subsys_id);
        if (css && css_tryget(css))
                bio->bi_css = css;
        rcu_read_unlock();

        return 0;
}

/**
 * bio_disassociate_task - undo bio_associate_current()
 * @bio: target bio
 */
void bio_disassociate_task(struct bio *bio)
{
        if (bio->bi_ioc) {
                put_io_context(bio->bi_ioc);
                bio->bi_ioc = NULL;
        }
        if (bio->bi_css) {
                css_put(bio->bi_css);
                bio->bi_css = NULL;
        }
}

#endif /* CONFIG_BLK_CGROUP */

static void __init biovec_init_slabs(void)
{
        int i;

        for (i = 0; i < BIOVEC_NR_POOLS; i++) {
                int size;
                struct biovec_slab *bvs = bvec_slabs + i;

                if (bvs->nr_vecs <= BIO_INLINE_VECS) {
                        bvs->slab = NULL;
                        continue;
                }

                size = bvs->nr_vecs * sizeof(struct bio_vec);
                bvs->slab = kmem_cache_create(bvs->name, size, 0,
                                              SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
        }
}

static int __init init_bio(void)
{
        bio_slab_max = 2;
        bio_slab_nr = 0;
        bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
        if (!bio_slabs)
                panic("bio: can't allocate bios\n");

        bio_integrity_init();
        biovec_init_slabs();

        fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
        if (!fs_bio_set)
                panic("bio: can't allocate bios\n");

        if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
                panic("bio: can't create integrity pool\n");

        bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
                                                     sizeof(struct bio_pair));
        if (!bio_split_pool)
                panic("bio: can't create split pool\n");

        return 0;
}
subsys_initcall(init_bio);