Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* linux/fs/block_dev.c
	3	*
	4	* Copyright (C) 1991, 1992 Linus Torvalds
	5	* Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
	6	*/
	7
	8	#include <linux/init.h>
	9	#include <linux/mm.h>
	10	#include <linux/fcntl.h>
	11	#include <linux/slab.h>
	12	#include <linux/kmod.h>
	13	#include <linux/major.h>
	14	#include <linux/device_cgroup.h>
	15	#include <linux/highmem.h>
	16	#include <linux/blkdev.h>
	17	#include <linux/backing-dev.h>
	18	#include <linux/module.h>
	19	#include <linux/blkpg.h>
	20	#include <linux/magic.h>
	21	#include <linux/dax.h>
	22	#include <linux/buffer_head.h>
	23	#include <linux/swap.h>
	24	#include <linux/pagevec.h>
	25	#include <linux/writeback.h>
	26	#include <linux/mpage.h>
	27	#include <linux/mount.h>
	28	#include <linux/uio.h>
	29	#include <linux/namei.h>
	30	#include <linux/log2.h>
	31	#include <linux/cleancache.h>
	32	#include <linux/dax.h>
	33	#include <linux/badblocks.h>
	34	#include <linux/task_io_accounting_ops.h>
	35	#include <linux/falloc.h>
	36	#include <linux/uaccess.h>
	37	#include "internal.h"
	38
	/*
	 * Pairs a block_device with the VFS inode that represents it.  BDEV_I()
	 * uses container_of() on vfs_inode to get back to this structure.
	 */
	struct bdev_inode {
		struct block_device bdev;	/* block device state */
		struct inode vfs_inode;		/* embedded VFS inode */
	};
	43
	/* Forward declaration; bdget() installs this as a_ops on new bdev inodes. */
	static const struct address_space_operations def_blk_aops;
	45
	46	static inline struct bdev_inode BDEV_I(struct inode inode)
	47	{
	48	return container_of(inode, struct bdev_inode, vfs_inode);
	49	}
	50
	51	struct block_device I_BDEV(struct inode inode)
	52	{
	53	return &BDEV_I(inode)->bdev;
	54	}
	55	EXPORT_SYMBOL(I_BDEV);
	56
	57	static void bdev_write_inode(struct block_device *bdev)
	58	{
	59	struct inode *inode = bdev->bd_inode;
	60	int ret;
	61
	62	spin_lock(&inode->i_lock);
	63	while (inode->i_state & I_DIRTY) {
	64	spin_unlock(&inode->i_lock);
	65	ret = write_inode_now(inode, true);
	66	if (ret) {
	67	char name[BDEVNAME_SIZE];
	68	pr_warn_ratelimited("VFS: Dirty inode writeback failed "
	69	"for block device %s (err=%d).\n",
	70	bdevname(bdev, name), ret);
	71	}
	72	spin_lock(&inode->i_lock);
	73	}
	74	spin_unlock(&inode->i_lock);
	75	}
	76
	77	/* Kill _all_ buffers and pagecache , dirty or not.. */
	78	void kill_bdev(struct block_device *bdev)
	79	{
	80	struct address_space *mapping = bdev->bd_inode->i_mapping;
	81
	82	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
	83	return;
	84
	85	invalidate_bh_lrus();
	86	truncate_inode_pages(mapping, 0);
	87	}
	88	EXPORT_SYMBOL(kill_bdev);
	89
	90	/* Invalidate clean unused buffers and pagecache. */
	91	void invalidate_bdev(struct block_device *bdev)
	92	{
	93	struct address_space *mapping = bdev->bd_inode->i_mapping;
	94
	95	if (mapping->nrpages) {
	96	invalidate_bh_lrus();
	97	lru_add_drain_all(); /* make sure all lru add caches are flushed */
	98	invalidate_mapping_pages(mapping, 0, -1);
	99	}
	100	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	101	* But, for the strange corners, lets be cautious
	102	*/
	103	cleancache_invalidate_inode(mapping);
	104	}
	105	EXPORT_SYMBOL(invalidate_bdev);
	106
	107	static void set_init_blocksize(struct block_device *bdev)
	108	{
	109	unsigned bsize = bdev_logical_block_size(bdev);
	110	loff_t size = i_size_read(bdev->bd_inode);
	111
	112	while (bsize < PAGE_SIZE) {
	113	if (size & bsize)
	114	break;
	115	bsize <<= 1;
	116	}
	117	bdev->bd_block_size = bsize;
	118	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
	119	}
	120
	121	int set_blocksize(struct block_device *bdev, int size)
	122	{
	123	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	124	if (size > PAGE_SIZE \|\| size < 512 \|\| !is_power_of_2(size))
	125	return -EINVAL;
	126
	127	/* Size cannot be smaller than the size supported by the device */
	128	if (size < bdev_logical_block_size(bdev))
	129	return -EINVAL;
	130
	131	/* Don't change the size if it is same as current */
	132	if (bdev->bd_block_size != size) {
	133	sync_blockdev(bdev);
	134	bdev->bd_block_size = size;
	135	bdev->bd_inode->i_blkbits = blksize_bits(size);
	136	kill_bdev(bdev);
	137	}
	138	return 0;
	139	}
	140
	141	EXPORT_SYMBOL(set_blocksize);
	142
	143	int sb_set_blocksize(struct super_block *sb, int size)
	144	{
	145	if (set_blocksize(sb->s_bdev, size))
	146	return 0;
	147	/* If we get here, we know size is power of two
	148	* and it's value is between 512 and PAGE_SIZE */
	149	sb->s_blocksize = size;
	150	sb->s_blocksize_bits = blksize_bits(size);
	151	return sb->s_blocksize;
	152	}
	153
	154	EXPORT_SYMBOL(sb_set_blocksize);
	155
	156	int sb_min_blocksize(struct super_block *sb, int size)
	157	{
	158	int minsize = bdev_logical_block_size(sb->s_bdev);
	159	if (size < minsize)
	160	size = minsize;
	161	return sb_set_blocksize(sb, size);
	162	}
	163
	164	EXPORT_SYMBOL(sb_min_blocksize);
	165
	166	static int
	167	blkdev_get_block(struct inode *inode, sector_t iblock,
	168	struct buffer_head *bh, int create)
	169	{
	170	bh->b_bdev = I_BDEV(inode);
	171	bh->b_blocknr = iblock;
	172	set_buffer_mapped(bh);
	173	return 0;
	174	}
	175
	176	static struct inode bdev_file_inode(struct file file)
	177	{
	178	return file->f_mapping->host;
	179	}
	180
	181	static unsigned int dio_bio_write_op(struct kiocb *iocb)
	182	{
	183	unsigned int op = REQ_OP_WRITE \| REQ_SYNC \| REQ_IDLE;
	184
	185	/* avoid the need for a I/O completion work item */
	186	if (iocb->ki_flags & IOCB_DSYNC)
	187	op \|= REQ_FUA;
	188	return op;
	189	}
	190
	/* Number of bio_vecs __blkdev_direct_IO_simple() keeps on its stack. */
	#define DIO_INLINE_BIO_VECS 4
	192
	193	static void blkdev_bio_end_io_simple(struct bio *bio)
	194	{
	195	struct task_struct *waiter = bio->bi_private;
	196
	197	WRITE_ONCE(bio->bi_private, NULL);
	198	blk_wake_io_task(waiter);
	199	}
	200
	201	static ssize_t
	202	__blkdev_direct_IO_simple(struct kiocb iocb, struct iov_iter iter,
	203	int nr_pages)
	204	{
	205	struct file *file = iocb->ki_filp;
	206	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	207	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], vecs, bvec;
	208	loff_t pos = iocb->ki_pos;
	209	bool should_dirty = false;
	210	struct bio bio;
	211	ssize_t ret;
	212	blk_qc_t qc;
	213	int i;
	214
	215	if ((pos \| iov_iter_alignment(iter)) &
	216	(bdev_logical_block_size(bdev) - 1))
	217	return -EINVAL;
	218
	219	if (nr_pages <= DIO_INLINE_BIO_VECS)
	220	vecs = inline_vecs;
	221	else {
	222	vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
	223	GFP_KERNEL);
	224	if (!vecs)
	225	return -ENOMEM;
	226	}
	227
	228	bio_init(&bio, vecs, nr_pages);
	229	bio_set_dev(&bio, bdev);
	230	bio.bi_iter.bi_sector = pos >> 9;
	231	bio.bi_write_hint = iocb->ki_hint;
	232	bio.bi_private = current;
	233	bio.bi_end_io = blkdev_bio_end_io_simple;
	234	bio.bi_ioprio = iocb->ki_ioprio;
	235
	236	ret = bio_iov_iter_get_pages(&bio, iter);
	237	if (unlikely(ret))
	238	goto out;
	239	ret = bio.bi_iter.bi_size;
	240
	241	if (iov_iter_rw(iter) == READ) {
	242	bio.bi_opf = REQ_OP_READ;
	243	if (iter_is_iovec(iter))
	244	should_dirty = true;
	245	} else {
	246	bio.bi_opf = dio_bio_write_op(iocb);
	247	task_io_account_write(ret);
	248	}
	249	if (iocb->ki_flags & IOCB_HIPRI)
	250	bio.bi_opf \|= REQ_HIPRI;
	251
	252	qc = submit_bio(&bio);
	253	for (;;) {
	254	set_current_state(TASK_UNINTERRUPTIBLE);
	255	if (!READ_ONCE(bio.bi_private))
	256	break;
	257	if (!(iocb->ki_flags & IOCB_HIPRI) \|\|
	258	!blk_poll(bdev_get_queue(bdev), qc, true))
	259	io_schedule();
	260	}
	261	__set_current_state(TASK_RUNNING);
	262
	263	bio_for_each_segment_all(bvec, &bio, i) {
	264	if (should_dirty && !PageCompound(bvec->bv_page))
	265	set_page_dirty_lock(bvec->bv_page);
	266	put_page(bvec->bv_page);
	267	}
	268
	269	if (unlikely(bio.bi_status))
	270	ret = blk_status_to_errno(bio.bi_status);
	271
	272	out:
	273	if (vecs != inline_vecs)
	274	kfree(vecs);
	275
	276	bio_uninit(&bio);
	277
	278	return ret;
	279	}
	280
	/*
	 * State of one direct I/O request issued by __blkdev_direct_IO().  The
	 * first bio of the request is embedded as the last member, so the dio is
	 * recovered from that bio with container_of().
	 */
	struct blkdev_dio {
		union {
			struct kiocb *iocb;		/* set when !is_sync */
			struct task_struct *waiter;	/* set when is_sync */
		};
		size_t size;		/* bytes covered by the bios set up so far */
		atomic_t ref;		/* bio count, used once multi_bio is set */
		bool multi_bio : 1;	/* request spans more than one bio */
		bool should_dirty : 1;	/* read into a user iovec */
		bool is_sync : 1;	/* submitter waits for completion */
		struct bio bio;		/* first bio of the request */
	};
	293
	/* bio_set used by __blkdev_direct_IO(); set up in blkdev_init(). */
	static struct bio_set blkdev_dio_pool;
	295
	296	static void blkdev_bio_end_io(struct bio *bio)
	297	{
	298	struct blkdev_dio *dio = bio->bi_private;
	299	bool should_dirty = dio->should_dirty;
	300
	301	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
	302	if (bio->bi_status && !dio->bio.bi_status)
	303	dio->bio.bi_status = bio->bi_status;
	304	} else {
	305	if (!dio->is_sync) {
	306	struct kiocb *iocb = dio->iocb;
	307	ssize_t ret;
	308
	309	if (likely(!dio->bio.bi_status)) {
	310	ret = dio->size;
	311	iocb->ki_pos += ret;
	312	} else {
	313	ret = blk_status_to_errno(dio->bio.bi_status);
	314	}
	315
	316	dio->iocb->ki_complete(iocb, ret, 0);
	317	if (dio->multi_bio)
	318	bio_put(&dio->bio);
	319	} else {
	320	struct task_struct *waiter = dio->waiter;
	321
	322	WRITE_ONCE(dio->waiter, NULL);
	323	blk_wake_io_task(waiter);
	324	}
	325	}
	326
	327	if (should_dirty) {
	328	bio_check_pages_dirty(bio);
	329	} else {
	330	struct bio_vec *bvec;
	331	int i;
	332
	333	bio_for_each_segment_all(bvec, bio, i)
	334	put_page(bvec->bv_page);
	335	bio_put(bio);
	336	}
	337	}
	338
	339	static ssize_t
	340	__blkdev_direct_IO(struct kiocb iocb, struct iov_iter iter, int nr_pages)
	341	{
	342	struct file *file = iocb->ki_filp;
	343	struct inode *inode = bdev_file_inode(file);
	344	struct block_device *bdev = I_BDEV(inode);
	345	struct blk_plug plug;
	346	struct blkdev_dio *dio;
	347	struct bio *bio;
	348	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
	349	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
	350	loff_t pos = iocb->ki_pos;
	351	blk_qc_t qc = BLK_QC_T_NONE;
	352	int ret = 0;
	353
	354	if ((pos \| iov_iter_alignment(iter)) &
	355	(bdev_logical_block_size(bdev) - 1))
	356	return -EINVAL;
	357
	358	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
	359
	360	dio = container_of(bio, struct blkdev_dio, bio);
	361	dio->is_sync = is_sync = is_sync_kiocb(iocb);
	362	if (dio->is_sync) {
	363	dio->waiter = current;
	364	bio_get(bio);
	365	} else {
	366	dio->iocb = iocb;
	367	}
	368
	369	dio->size = 0;
	370	dio->multi_bio = false;
	371	dio->should_dirty = is_read && iter_is_iovec(iter);
	372
	373	/*
	374	* Don't plug for HIPRI/polled IO, as those should go straight
	375	* to issue
	376	*/
	377	if (!is_poll)
	378	blk_start_plug(&plug);
	379
	380	for (;;) {
	381	bio_set_dev(bio, bdev);
	382	bio->bi_iter.bi_sector = pos >> 9;
	383	bio->bi_write_hint = iocb->ki_hint;
	384	bio->bi_private = dio;
	385	bio->bi_end_io = blkdev_bio_end_io;
	386	bio->bi_ioprio = iocb->ki_ioprio;
	387
	388	ret = bio_iov_iter_get_pages(bio, iter);
	389	if (unlikely(ret)) {
	390	bio->bi_status = BLK_STS_IOERR;
	391	bio_endio(bio);
	392	break;
	393	}
	394
	395	if (is_read) {
	396	bio->bi_opf = REQ_OP_READ;
	397	if (dio->should_dirty)
	398	bio_set_pages_dirty(bio);
	399	} else {
	400	bio->bi_opf = dio_bio_write_op(iocb);
	401	task_io_account_write(bio->bi_iter.bi_size);
	402	}
	403
	404	dio->size += bio->bi_iter.bi_size;
	405	pos += bio->bi_iter.bi_size;
	406
	407	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
	408	if (!nr_pages) {
	409	if (iocb->ki_flags & IOCB_HIPRI)
	410	bio->bi_opf \|= REQ_HIPRI;
	411
	412	qc = submit_bio(bio);
	413	break;
	414	}
	415
	416	if (!dio->multi_bio) {
	417	/*
	418	* AIO needs an extra reference to ensure the dio
	419	* structure which is embedded into the first bio
	420	* stays around.
	421	*/
	422	if (!is_sync)
	423	bio_get(bio);
	424	dio->multi_bio = true;
	425	atomic_set(&dio->ref, 2);
	426	} else {
	427	atomic_inc(&dio->ref);
	428	}
	429
	430	submit_bio(bio);
	431	bio = bio_alloc(GFP_KERNEL, nr_pages);
	432	}
	433
	434	if (!is_poll)
	435	blk_finish_plug(&plug);
	436
	437	if (!is_sync)
	438	return -EIOCBQUEUED;
	439
	440	for (;;) {
	441	set_current_state(TASK_UNINTERRUPTIBLE);
	442	if (!READ_ONCE(dio->waiter))
	443	break;
	444
	445	if (!(iocb->ki_flags & IOCB_HIPRI) \|\|
	446	!blk_poll(bdev_get_queue(bdev), qc, true))
	447	io_schedule();
	448	}
	449	__set_current_state(TASK_RUNNING);
	450
	451	if (!ret)
	452	ret = blk_status_to_errno(dio->bio.bi_status);
	453	if (likely(!ret))
	454	ret = dio->size;
	455
	456	bio_put(&dio->bio);
	457	return ret;
	458	}
	459
	460	static ssize_t
	461	blkdev_direct_IO(struct kiocb iocb, struct iov_iter iter)
	462	{
	463	int nr_pages;
	464
	465	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
	466	if (!nr_pages)
	467	return 0;
	468	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
	469	return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
	470
	471	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
	472	}
	473
	474	static __init int blkdev_init(void)
	475	{
	476	return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
	477	}
	478	module_init(blkdev_init);
	479
	480	int __sync_blockdev(struct block_device *bdev, int wait)
	481	{
	482	if (!bdev)
	483	return 0;
	484	if (!wait)
	485	return filemap_flush(bdev->bd_inode->i_mapping);
	486	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
	487	}
	488
	/*
	 * Write out all dirty data associated with a block device via its
	 * mapping and wait for it.  Does not take the superblock lock.
	 */
	int sync_blockdev(struct block_device *bdev)
	{
		const int wait = 1;

		return __sync_blockdev(bdev, wait);
	}
	EXPORT_SYMBOL(sync_blockdev);
	498
	/*
	 * Write out and wait upon all dirty data associated with this device:
	 * the filesystem mounted on it if there is one, otherwise just the block
	 * device itself.  Takes the superblock lock.
	 */
	int fsync_bdev(struct block_device *bdev)
	{
		struct super_block *sb = get_super(bdev);
		int res;

		if (!sb)
			return sync_blockdev(bdev);

		res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	EXPORT_SYMBOL(fsync_bdev);
	515
	516	/**
	517	* freeze_bdev -- lock a filesystem and force it into a consistent state
	518	* @bdev: blockdevice to lock
	519	*
	520	* If a superblock is found on this device, we take the s_umount semaphore
	521	* on it to make sure nobody unmounts until the snapshot creation is done.
	522	* The reference counter (bd_fsfreeze_count) guarantees that only the last
	523	* unfreeze process can unfreeze the frozen filesystem actually when multiple
	524	* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
	525	* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
	526	* actually.
	527	*/
	528	struct super_block freeze_bdev(struct block_device bdev)
	529	{
	530	struct super_block *sb;
	531	int error = 0;
	532
	533	mutex_lock(&bdev->bd_fsfreeze_mutex);
	534	if (++bdev->bd_fsfreeze_count > 1) {
	535	/*
	536	* We don't even need to grab a reference - the first call
	537	* to freeze_bdev grab an active reference and only the last
	538	* thaw_bdev drops it.
	539	*/
	540	sb = get_super(bdev);
	541	if (sb)
	542	drop_super(sb);
	543	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	544	return sb;
	545	}
	546
	547	sb = get_active_super(bdev);
	548	if (!sb)
	549	goto out;
	550	if (sb->s_op->freeze_super)
	551	error = sb->s_op->freeze_super(sb);
	552	else
	553	error = freeze_super(sb);
	554	if (error) {
	555	deactivate_super(sb);
	556	bdev->bd_fsfreeze_count--;
	557	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	558	return ERR_PTR(error);
	559	}
	560	deactivate_super(sb);
	561	out:
	562	sync_blockdev(bdev);
	563	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	564	return sb; /* thaw_bdev releases s->s_umount */
	565	}
	566	EXPORT_SYMBOL(freeze_bdev);
	567
	568	/**
	569	* thaw_bdev -- unlock filesystem
	570	* @bdev: blockdevice to unlock
	571	* @sb: associated superblock
	572	*
	573	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
	574	*/
	575	int thaw_bdev(struct block_device bdev, struct super_block sb)
	576	{
	577	int error = -EINVAL;
	578
	579	mutex_lock(&bdev->bd_fsfreeze_mutex);
	580	if (!bdev->bd_fsfreeze_count)
	581	goto out;
	582
	583	error = 0;
	584	if (--bdev->bd_fsfreeze_count > 0)
	585	goto out;
	586
	587	if (!sb)
	588	goto out;
	589
	590	if (sb->s_op->thaw_super)
	591	error = sb->s_op->thaw_super(sb);
	592	else
	593	error = thaw_super(sb);
	594	if (error)
	595	bdev->bd_fsfreeze_count++;
	596	out:
	597	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	598	return error;
	599	}
	600	EXPORT_SYMBOL(thaw_bdev);
	601
	602	static int blkdev_writepage(struct page page, struct writeback_control wbc)
	603	{
	604	return block_write_full_page(page, blkdev_get_block, wbc);
	605	}
	606
	607	static int blkdev_readpage(struct file * file, struct page * page)
	608	{
	609	return block_read_full_page(page, blkdev_get_block);
	610	}
	611
	612	static int blkdev_readpages(struct file file, struct address_space mapping,
	613	struct list_head *pages, unsigned nr_pages)
	614	{
	615	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
	616	}
	617
	618	static int blkdev_write_begin(struct file file, struct address_space mapping,
	619	loff_t pos, unsigned len, unsigned flags,
	620	struct page pagep, void fsdata)
	621	{
	622	return block_write_begin(mapping, pos, len, flags, pagep,
	623	blkdev_get_block);
	624	}
	625
	626	static int blkdev_write_end(struct file file, struct address_space mapping,
	627	loff_t pos, unsigned len, unsigned copied,
	628	struct page page, void fsdata)
	629	{
	630	int ret;
	631	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	632
	633	unlock_page(page);
	634	put_page(page);
	635
	636	return ret;
	637	}
	638
	639	/*
	640	* private llseek:
	641	* for a block special file file_inode(file)->i_size is zero
	642	* so we compute the size by hand (just as in block_read/write above)
	643	*/
	644	static loff_t block_llseek(struct file *file, loff_t offset, int whence)
	645	{
	646	struct inode *bd_inode = bdev_file_inode(file);
	647	loff_t retval;
	648
	649	inode_lock(bd_inode);
	650	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
	651	inode_unlock(bd_inode);
	652	return retval;
	653	}
	654
	655	int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
	656	{
	657	struct inode *bd_inode = bdev_file_inode(filp);
	658	struct block_device *bdev = I_BDEV(bd_inode);
	659	int error;
	660
	661	error = file_write_and_wait_range(filp, start, end);
	662	if (error)
	663	return error;
	664
	665	/*
	666	* There is no need to serialise calls to blkdev_issue_flush with
	667	* i_mutex and doing so causes performance issues with concurrent
	668	* O_SYNC writers to a block device.
	669	*/
	670	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	671	if (error == -EOPNOTSUPP)
	672	error = 0;
	673
	674	return error;
	675	}
	676	EXPORT_SYMBOL(blkdev_fsync);
	677
	678	/**
	679	* bdev_read_page() - Start reading a page from a block device
	680	* @bdev: The device to read the page from
	681	* @sector: The offset on the device to read the page to (need not be aligned)
	682	* @page: The page to read
	683	*
	684	* On entry, the page should be locked. It will be unlocked when the page
	685	* has been read. If the block driver implements rw_page synchronously,
	686	* that will be true on exit from this function, but it need not be.
	687	*
	688	* Errors returned by this function are usually "soft", eg out of memory, or
	689	* queue full; callers should try a different route to read this page rather
	690	* than propagate an error back up the stack.
	691	*
	692	* Return: negative errno if an error occurs, 0 if submission was successful.
	693	*/
	694	int bdev_read_page(struct block_device *bdev, sector_t sector,
	695	struct page *page)
	696	{
	697	const struct block_device_operations *ops = bdev->bd_disk->fops;
	698	int result = -EOPNOTSUPP;
	699
	700	if (!ops->rw_page \|\| bdev_get_integrity(bdev))
	701	return result;
	702
	703	result = blk_queue_enter(bdev->bd_queue, 0);
	704	if (result)
	705	return result;
	706	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
	707	REQ_OP_READ);
	708	blk_queue_exit(bdev->bd_queue);
	709	return result;
	710	}
	711	EXPORT_SYMBOL_GPL(bdev_read_page);
	712
	713	/**
	714	* bdev_write_page() - Start writing a page to a block device
	715	* @bdev: The device to write the page to
	716	* @sector: The offset on the device to write the page to (need not be aligned)
	717	* @page: The page to write
	718	* @wbc: The writeback_control for the write
	719	*
	720	* On entry, the page should be locked and not currently under writeback.
	721	* On exit, if the write started successfully, the page will be unlocked and
	722	* under writeback. If the write failed already (eg the driver failed to
	723	* queue the page to the device), the page will still be locked. If the
	724	* caller is a ->writepage implementation, it will need to unlock the page.
	725	*
	726	* Errors returned by this function are usually "soft", eg out of memory, or
	727	* queue full; callers should try a different route to write this page rather
	728	* than propagate an error back up the stack.
	729	*
	730	* Return: negative errno if an error occurs, 0 if submission was successful.
	731	*/
	732	int bdev_write_page(struct block_device *bdev, sector_t sector,
	733	struct page page, struct writeback_control wbc)
	734	{
	735	int result;
	736	const struct block_device_operations *ops = bdev->bd_disk->fops;
	737
	738	if (!ops->rw_page \|\| bdev_get_integrity(bdev))
	739	return -EOPNOTSUPP;
	740	result = blk_queue_enter(bdev->bd_queue, 0);
	741	if (result)
	742	return result;
	743
	744	set_page_writeback(page);
	745	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
	746	REQ_OP_WRITE);
	747	if (result) {
	748	end_page_writeback(page);
	749	} else {
	750	clean_page_buffers(page);
	751	unlock_page(page);
	752	}
	753	blk_queue_exit(bdev->bd_queue);
	754	return result;
	755	}
	756	EXPORT_SYMBOL_GPL(bdev_write_page);
	757
	/*
	 * pseudo-fs
	 */

	/* Taken around all_bdevs list updates and inode->i_bdev changes below. */
	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
	/* Slab cache for struct bdev_inode; created in bdev_cache_init(). */
	static struct kmem_cache * bdev_cachep __read_mostly;
	764
	765	static struct inode bdev_alloc_inode(struct super_block sb)
	766	{
	767	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	768	if (!ei)
	769	return NULL;
	770	return &ei->vfs_inode;
	771	}
	772
	773	static void bdev_i_callback(struct rcu_head *head)
	774	{
	775	struct inode *inode = container_of(head, struct inode, i_rcu);
	776	struct bdev_inode *bdi = BDEV_I(inode);
	777
	778	kmem_cache_free(bdev_cachep, bdi);
	779	}
	780
	781	static void bdev_destroy_inode(struct inode *inode)
	782	{
	783	call_rcu(&inode->i_rcu, bdev_i_callback);
	784	}
	785
	786	static void init_once(void *foo)
	787	{
	788	struct bdev_inode ei = (struct bdev_inode ) foo;
	789	struct block_device *bdev = &ei->bdev;
	790
	791	memset(bdev, 0, sizeof(*bdev));
	792	mutex_init(&bdev->bd_mutex);
	793	INIT_LIST_HEAD(&bdev->bd_list);
	794	#ifdef CONFIG_SYSFS
	795	INIT_LIST_HEAD(&bdev->bd_holder_disks);
	796	#endif
	797	bdev->bd_bdi = &noop_backing_dev_info;
	798	inode_init_once(&ei->vfs_inode);
	799	/* Initialize mutex for freeze. */
	800	mutex_init(&bdev->bd_fsfreeze_mutex);
	801	}
	802
	803	static void bdev_evict_inode(struct inode *inode)
	804	{
	805	struct block_device *bdev = &BDEV_I(inode)->bdev;
	806	truncate_inode_pages_final(&inode->i_data);
	807	invalidate_inode_buffers(inode); /* is it needed here? */
	808	clear_inode(inode);
	809	spin_lock(&bdev_lock);
	810	list_del_init(&bdev->bd_list);
	811	spin_unlock(&bdev_lock);
	812	/* Detach inode from wb early as bdi_put() may free bdi->wb */
	813	inode_detach_wb(inode);
	814	if (bdev->bd_bdi != &noop_backing_dev_info) {
	815	bdi_put(bdev->bd_bdi);
	816	bdev->bd_bdi = &noop_backing_dev_info;
	817	}
	818	}
	819
	820	static const struct super_operations bdev_sops = {
	821	.statfs = simple_statfs,
	822	.alloc_inode = bdev_alloc_inode,
	823	.destroy_inode = bdev_destroy_inode,
	824	.drop_inode = generic_delete_inode,
	825	.evict_inode = bdev_evict_inode,
	826	};
	827
	828	static struct dentry bd_mount(struct file_system_type fs_type,
	829	int flags, const char dev_name, void data)
	830	{
	831	struct dentry *dent;
	832	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
	833	if (!IS_ERR(dent))
	834	dent->d_sb->s_iflags \|= SB_I_CGROUPWB;
	835	return dent;
	836	}
	837
	838	static struct file_system_type bd_type = {
	839	.name = "bdev",
	840	.mount = bd_mount,
	841	.kill_sb = kill_anon_super,
	842	};
	843
	/* Superblock of the bdev pseudo-fs mount; assigned in bdev_cache_init(). */
	struct super_block *blockdev_superblock __read_mostly;
	EXPORT_SYMBOL_GPL(blockdev_superblock);
	846
	847	void __init bdev_cache_init(void)
	848	{
	849	int err;
	850	static struct vfsmount *bd_mnt;
	851
	852	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
	853	0, (SLAB_HWCACHE_ALIGN\|SLAB_RECLAIM_ACCOUNT\|
	854	SLAB_MEM_SPREAD\|SLAB_ACCOUNT\|SLAB_PANIC),
	855	init_once);
	856	err = register_filesystem(&bd_type);
	857	if (err)
	858	panic("Cannot register bdev pseudo-fs");
	859	bd_mnt = kern_mount(&bd_type);
	860	if (IS_ERR(bd_mnt))
	861	panic("Cannot create bdev pseudo-fs");
	862	blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
	863	}
	864
	865	/*
	866	* Most likely _very_ bad one - but then it's hardly critical for small
	867	* /dev and can be fixed when somebody will need really large one.
	868	* Keep in mind that it will be fed through icache hash function too.
	869	*/
	870	static inline unsigned long hash(dev_t dev)
	871	{
	872	return MAJOR(dev)+MINOR(dev);
	873	}
	874
	875	static int bdev_test(struct inode inode, void data)
	876	{
	877	return BDEV_I(inode)->bdev.bd_dev == (dev_t )data;
	878	}
	879
	880	static int bdev_set(struct inode inode, void data)
	881	{
	882	BDEV_I(inode)->bdev.bd_dev = (dev_t )data;
	883	return 0;
	884	}
	885
	/* List of block devices set up by bdget(); modified under bdev_lock. */
	static LIST_HEAD(all_bdevs);
	887
	888	/*
	889	* If there is a bdev inode for this device, unhash it so that it gets evicted
	890	* as soon as last inode reference is dropped.
	891	*/
	892	void bdev_unhash_inode(dev_t dev)
	893	{
	894	struct inode *inode;
	895
	896	inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
	897	if (inode) {
	898	remove_inode_hash(inode);
	899	iput(inode);
	900	}
	901	}
	902
	903	struct block_device *bdget(dev_t dev)
	904	{
	905	struct block_device *bdev;
	906	struct inode *inode;
	907
	908	inode = iget5_locked(blockdev_superblock, hash(dev),
	909	bdev_test, bdev_set, &dev);
	910
	911	if (!inode)
	912	return NULL;
	913
	914	bdev = &BDEV_I(inode)->bdev;
	915
	916	if (inode->i_state & I_NEW) {
	917	bdev->bd_contains = NULL;
	918	bdev->bd_super = NULL;
	919	bdev->bd_inode = inode;
	920	bdev->bd_block_size = i_blocksize(inode);
	921	bdev->bd_part_count = 0;
	922	bdev->bd_invalidated = 0;
	923	inode->i_mode = S_IFBLK;
	924	inode->i_rdev = dev;
	925	inode->i_bdev = bdev;
	926	inode->i_data.a_ops = &def_blk_aops;
	927	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
	928	spin_lock(&bdev_lock);
	929	list_add(&bdev->bd_list, &all_bdevs);
	930	spin_unlock(&bdev_lock);
	931	unlock_new_inode(inode);
	932	}
	933	return bdev;
	934	}
	935
	936	EXPORT_SYMBOL(bdget);
	937
	938	/**
	939	* bdgrab -- Grab a reference to an already referenced block device
	940	* @bdev: Block device to grab a reference to.
	941	*/
	942	struct block_device bdgrab(struct block_device bdev)
	943	{
	944	ihold(bdev->bd_inode);
	945	return bdev;
	946	}
	947	EXPORT_SYMBOL(bdgrab);
	948
	949	long nr_blockdev_pages(void)
	950	{
	951	struct block_device *bdev;
	952	long ret = 0;
	953	spin_lock(&bdev_lock);
	954	list_for_each_entry(bdev, &all_bdevs, bd_list) {
	955	ret += bdev->bd_inode->i_mapping->nrpages;
	956	}
	957	spin_unlock(&bdev_lock);
	958	return ret;
	959	}
	960
	961	void bdput(struct block_device *bdev)
	962	{
	963	iput(bdev->bd_inode);
	964	}
	965
	966	EXPORT_SYMBOL(bdput);
	967
	968	static struct block_device bd_acquire(struct inode inode)
	969	{
	970	struct block_device *bdev;
	971
	972	spin_lock(&bdev_lock);
	973	bdev = inode->i_bdev;
	974	if (bdev && !inode_unhashed(bdev->bd_inode)) {
	975	bdgrab(bdev);
	976	spin_unlock(&bdev_lock);
	977	return bdev;
	978	}
	979	spin_unlock(&bdev_lock);
	980
	981	/*
	982	* i_bdev references block device inode that was already shut down
	983	* (corresponding device got removed). Remove the reference and look
	984	* up block device inode again just in case new device got
	985	* reestablished under the same device number.
	986	*/
	987	if (bdev)
	988	bd_forget(inode);
	989
	990	bdev = bdget(inode->i_rdev);
	991	if (bdev) {
	992	spin_lock(&bdev_lock);
	993	if (!inode->i_bdev) {
	994	/*
	995	* We take an additional reference to bd_inode,
	996	* and it's released in clear_inode() of inode.
	997	* So, we can access it via ->i_mapping always
	998	* without igrab().
	999	*/
	1000	bdgrab(bdev);
	1001	inode->i_bdev = bdev;
	1002	inode->i_mapping = bdev->bd_inode->i_mapping;
	1003	}
	1004	spin_unlock(&bdev_lock);
	1005	}
	1006	return bdev;
	1007	}
	1008
	1009	/* Call when you free inode */
	1010
	1011	void bd_forget(struct inode *inode)
	1012	{
	1013	struct block_device *bdev = NULL;
	1014
	1015	spin_lock(&bdev_lock);
	1016	if (!sb_is_blkdev_sb(inode->i_sb))
	1017	bdev = inode->i_bdev;
	1018	inode->i_bdev = NULL;
	1019	inode->i_mapping = &inode->i_data;
	1020	spin_unlock(&bdev_lock);
	1021
	1022	if (bdev)
	1023	bdput(bdev);
	1024	}
	1025
	1026	/**
	1027	* bd_may_claim - test whether a block device can be claimed
	1028	* @bdev: block device of interest
	1029	* @whole: whole block device containing @bdev, may equal @bdev
	1030	* @holder: holder trying to claim @bdev
	1031	*
	1032	* Test whether @bdev can be claimed by @holder.
	1033	*
	1034	* CONTEXT:
	1035	* spin_lock(&bdev_lock).
	1036	*
	1037	* RETURNS:
	1038	* %true if @bdev can be claimed, %false otherwise.
	1039	*/
	1040	static bool bd_may_claim(struct block_device bdev, struct block_device whole,
	1041	void *holder)
	1042	{
	1043	if (bdev->bd_holder == holder)
	1044	return true; /* already a holder */
	1045	else if (bdev->bd_holder != NULL)
	1046	return false; /* held by someone else */
	1047	else if (whole == bdev)
	1048	return true; /* is a whole device which isn't held */
	1049
	1050	else if (whole->bd_holder == bd_may_claim)
	1051	return true; /* is a partition of a device that is being partitioned */
	1052	else if (whole->bd_holder != NULL)
	1053	return false; /* is a partition of a held device */
	1054	else
	1055	return true; /* is a partition of an un-held device */
	1056	}
	1057
	1058	/**
	1059	* bd_prepare_to_claim - prepare to claim a block device
	1060	* @bdev: block device of interest
	1061	* @whole: the whole device containing @bdev, may equal @bdev
	1062	* @holder: holder trying to claim @bdev
	1063	*
	1064	* Prepare to claim @bdev. This function fails if @bdev is already
	1065	* claimed by another holder and waits if another claiming is in
	1066	* progress. This function doesn't actually claim. On successful
	1067	* return, the caller has ownership of bd_claiming and bd_holder[s].
	1068	*
	1069	* CONTEXT:
	1070	* spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
	1071	* it multiple times.
	1072	*
	1073	* RETURNS:
	1074	* 0 if @bdev can be claimed, -EBUSY otherwise.
	1075	*/
	1076	static int bd_prepare_to_claim(struct block_device *bdev,
	1077	struct block_device whole, void holder)
	1078	{
	1079	retry:
	1080	/* if someone else claimed, fail */
	1081	if (!bd_may_claim(bdev, whole, holder))
	1082	return -EBUSY;
	1083
	1084	/* if claiming is already in progress, wait for it to finish */
	1085	if (whole->bd_claiming) {
	1086	wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
	1087	DEFINE_WAIT(wait);
	1088
	1089	prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
	1090	spin_unlock(&bdev_lock);
	1091	schedule();
	1092	finish_wait(wq, &wait);
	1093	spin_lock(&bdev_lock);
	1094	goto retry;
	1095	}
	1096
	1097	/* yay, all mine */
	1098	return 0;
	1099	}
	1100
	1101	static struct gendisk bdev_get_gendisk(struct block_device bdev, int *partno)
	1102	{
	1103	struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
	1104
	1105	if (!disk)
	1106	return NULL;
	1107	/*
	1108	* Now that we hold gendisk reference we make sure bdev we looked up is
	1109	* not stale. If it is, it means device got removed and created before
	1110	* we looked up gendisk and we fail open in such case. Associating
	1111	* unhashed bdev with newly created gendisk could lead to two bdevs
	1112	* (and thus two independent caches) being associated with one device
	1113	* which is bad.
	1114	*/
	1115	if (inode_unhashed(bdev->bd_inode)) {
	1116	put_disk_and_module(disk);
	1117	return NULL;
	1118	}
	1119	return disk;
	1120	}
	1121
	1122	/**
	1123	* bd_start_claiming - start claiming a block device
	1124	* @bdev: block device of interest
	1125	* @holder: holder trying to claim @bdev
	1126	*
	1127	* @bdev is about to be opened exclusively. Check @bdev can be opened
	1128	* exclusively and mark that an exclusive open is in progress. Each
	1129	* successful call to this function must be matched with a call to
	1130	* either bd_finish_claiming() or bd_abort_claiming() (which do not
	1131	* fail).
	1132	*
	1133	* This function is used to gain exclusive access to the block device
	1134	* without actually causing other exclusive open attempts to fail. It
	1135	* should be used when the open sequence itself requires exclusive
	1136	* access but may subsequently fail.
	1137	*
	1138	* CONTEXT:
	1139	* Might sleep.
	1140	*
	1141	* RETURNS:
	1142	* Pointer to the block device containing @bdev on success, ERR_PTR()
	1143	* value on failure.
	1144	*/
	1145	static struct block_device bd_start_claiming(struct block_device bdev,
	1146	void *holder)
	1147	{
	1148	struct gendisk *disk;
	1149	struct block_device *whole;
	1150	int partno, err;
	1151
	1152	might_sleep();
	1153
	1154	/*
	1155	* @bdev might not have been initialized properly yet, look up
	1156	* and grab the outer block device the hard way.
	1157	*/
	1158	disk = bdev_get_gendisk(bdev, &partno);
	1159	if (!disk)
	1160	return ERR_PTR(-ENXIO);
	1161
	1162	/*
	1163	* Normally, @bdev should equal what's returned from bdget_disk()
	1164	* if partno is 0; however, some drivers (floppy) use multiple
	1165	* bdev's for the same physical device and @bdev may be one of the
	1166	* aliases. Keep @bdev if partno is 0. This means claimer
	1167	* tracking is broken for those devices but it has always been that
	1168	* way.
	1169	*/
	1170	if (partno)
	1171	whole = bdget_disk(disk, 0);
	1172	else
	1173	whole = bdgrab(bdev);
	1174
	1175	put_disk_and_module(disk);
	1176	if (!whole)
	1177	return ERR_PTR(-ENOMEM);
	1178
	1179	/* prepare to claim, if successful, mark claiming in progress */
	1180	spin_lock(&bdev_lock);
	1181
	1182	err = bd_prepare_to_claim(bdev, whole, holder);
	1183	if (err == 0) {
	1184	whole->bd_claiming = holder;
	1185	spin_unlock(&bdev_lock);
	1186	return whole;
	1187	} else {
	1188	spin_unlock(&bdev_lock);
	1189	bdput(whole);
	1190	return ERR_PTR(err);
	1191	}
	1192	}
	1193
	1194	#ifdef CONFIG_SYSFS
	1195	struct bd_holder_disk {
	1196	struct list_head list;
	1197	struct gendisk *disk;
	1198	int refcnt;
	1199	};
	1200
	1201	static struct bd_holder_disk bd_find_holder_disk(struct block_device bdev,
	1202	struct gendisk *disk)
	1203	{
	1204	struct bd_holder_disk *holder;
	1205
	1206	list_for_each_entry(holder, &bdev->bd_holder_disks, list)
	1207	if (holder->disk == disk)
	1208	return holder;
	1209	return NULL;
	1210	}
	1211
	/*
	 * Create a sysfs link inside @from that points at @to and is named
	 * after @to.  Returns the result of sysfs_create_link().
	 */
	static int add_symlink(struct kobject *from, struct kobject *to)
	{
		return sysfs_create_link(from, to, kobject_name(to));
	}
	1216
	/* Remove the sysfs link inside @from that is named after @to. */
	static void del_symlink(struct kobject *from, struct kobject *to)
	{
		sysfs_remove_link(from, kobject_name(to));
	}
	1221
	1222	/**
	1223	* bd_link_disk_holder - create symlinks between holding disk and slave bdev
	1224	* @bdev: the claimed slave bdev
	1225	* @disk: the holding disk
	1226	*
	1227	* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
	1228	*
	1229	* This functions creates the following sysfs symlinks.
	1230	*
	1231	* - from "slaves" directory of the holder @disk to the claimed @bdev
	1232	* - from "holders" directory of the @bdev to the holder @disk
	1233	*
	1234	* For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
	1235	* passed to bd_link_disk_holder(), then:
	1236	*
	1237	* /sys/block/dm-0/slaves/sda --> /sys/block/sda
	1238	* /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
	1239	*
	1240	* The caller must have claimed @bdev before calling this function and
	1241	* ensure that both @bdev and @disk are valid during the creation and
	1242	* lifetime of these symlinks.
	1243	*
	1244	* CONTEXT:
	1245	* Might sleep.
	1246	*
	1247	* RETURNS:
	1248	* 0 on success, -errno on failure.
	1249	*/
	1250	int bd_link_disk_holder(struct block_device bdev, struct gendisk disk)
	1251	{
	1252	struct bd_holder_disk *holder;
	1253	int ret = 0;
	1254
	1255	mutex_lock(&bdev->bd_mutex);
	1256
	1257	WARN_ON_ONCE(!bdev->bd_holder);
	1258
	1259	/* FIXME: remove the following once add_disk() handles errors */
	1260	if (WARN_ON(!disk->slave_dir \|\| !bdev->bd_part->holder_dir))
	1261	goto out_unlock;
	1262
	1263	holder = bd_find_holder_disk(bdev, disk);
	1264	if (holder) {
	1265	holder->refcnt++;
	1266	goto out_unlock;
	1267	}
	1268
	1269	holder = kzalloc(sizeof(*holder), GFP_KERNEL);
	1270	if (!holder) {
	1271	ret = -ENOMEM;
	1272	goto out_unlock;
	1273	}
	1274
	1275	INIT_LIST_HEAD(&holder->list);
	1276	holder->disk = disk;
	1277	holder->refcnt = 1;
	1278
	1279	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	1280	if (ret)
	1281	goto out_free;
	1282
	1283	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	1284	if (ret)
	1285	goto out_del;
	1286	/*
	1287	* bdev could be deleted beneath us which would implicitly destroy
	1288	* the holder directory. Hold on to it.
	1289	*/
	1290	kobject_get(bdev->bd_part->holder_dir);
	1291
	1292	list_add(&holder->list, &bdev->bd_holder_disks);
	1293	goto out_unlock;
	1294
	1295	out_del:
	1296	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	1297	out_free:
	1298	kfree(holder);
	1299	out_unlock:
	1300	mutex_unlock(&bdev->bd_mutex);
	1301	return ret;
	1302	}
	1303	EXPORT_SYMBOL_GPL(bd_link_disk_holder);
	1304
	1305	/**
	1306	* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
	1307	* @bdev: the calimed slave bdev
	1308	* @disk: the holding disk
	1309	*
	1310	* DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
	1311	*
	1312	* CONTEXT:
	1313	* Might sleep.
	1314	*/
	1315	void bd_unlink_disk_holder(struct block_device bdev, struct gendisk disk)
	1316	{
	1317	struct bd_holder_disk *holder;
	1318
	1319	mutex_lock(&bdev->bd_mutex);
	1320
	1321	holder = bd_find_holder_disk(bdev, disk);
	1322
	1323	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
	1324	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	1325	del_symlink(bdev->bd_part->holder_dir,
	1326	&disk_to_dev(disk)->kobj);
	1327	kobject_put(bdev->bd_part->holder_dir);
	1328	list_del_init(&holder->list);
	1329	kfree(holder);
	1330	}
	1331
	1332	mutex_unlock(&bdev->bd_mutex);
	1333	}
	1334	EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
	1335	#endif
	1336
	1337	/**
	1338	* flush_disk - invalidates all buffer-cache entries on a disk
	1339	*
	1340	* @bdev: struct block device to be flushed
	1341	* @kill_dirty: flag to guide handling of dirty inodes
	1342	*
	1343	* Invalidates all buffer-cache entries on a disk. It should be called
	1344	* when a disk has been changed -- either by a media change or online
	1345	* resize.
	1346	*/
	1347	static void flush_disk(struct block_device *bdev, bool kill_dirty)
	1348	{
	1349	if (__invalidate_device(bdev, kill_dirty)) {
	1350	printk(KERN_WARNING "VFS: busy inodes on changed media or "
	1351	"resized disk %s\n",
	1352	bdev->bd_disk ? bdev->bd_disk->disk_name : "");
	1353	}
	1354
	1355	if (!bdev->bd_disk)
	1356	return;
	1357	if (disk_part_scan_enabled(bdev->bd_disk))
	1358	bdev->bd_invalidated = 1;
	1359	}
	1360
	1361	/**
	1362	* check_disk_size_change - checks for disk size change and adjusts bdev size.
	1363	* @disk: struct gendisk to check
	1364	* @bdev: struct bdev to adjust.
	1365	* @verbose: if %true log a message about a size change if there is any
	1366	*
	1367	* This routine checks to see if the bdev size does not match the disk size
	1368	* and adjusts it if it differs. When shrinking the bdev size, its all caches
	1369	* are freed.
	1370	*/
	1371	void check_disk_size_change(struct gendisk disk, struct block_device bdev,
	1372	bool verbose)
	1373	{
	1374	loff_t disk_size, bdev_size;
	1375
	1376	disk_size = (loff_t)get_capacity(disk) << 9;
	1377	bdev_size = i_size_read(bdev->bd_inode);
	1378	if (disk_size != bdev_size) {
	1379	if (verbose) {
	1380	printk(KERN_INFO
	1381	"%s: detected capacity change from %lld to %lld\n",
	1382	disk->disk_name, bdev_size, disk_size);
	1383	}
	1384	i_size_write(bdev->bd_inode, disk_size);
	1385	if (bdev_size > disk_size)
	1386	flush_disk(bdev, false);
	1387	}
	1388	}
	1389
	1390	/**
	1391	* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
	1392	* @disk: struct gendisk to be revalidated
	1393	*
	1394	* This routine is a wrapper for lower-level driver's revalidate_disk
	1395	* call-backs. It is used to do common pre and post operations needed
	1396	* for all revalidate_disk operations.
	1397	*/
	1398	int revalidate_disk(struct gendisk *disk)
	1399	{
	1400	struct block_device *bdev;
	1401	int ret = 0;
	1402
	1403	if (disk->fops->revalidate_disk)
	1404	ret = disk->fops->revalidate_disk(disk);
	1405	bdev = bdget_disk(disk, 0);
	1406	if (!bdev)
	1407	return ret;
	1408
	1409	mutex_lock(&bdev->bd_mutex);
	1410	check_disk_size_change(disk, bdev, ret == 0);
	1411	bdev->bd_invalidated = 0;
	1412	mutex_unlock(&bdev->bd_mutex);
	1413	bdput(bdev);
	1414	return ret;
	1415	}
	1416	EXPORT_SYMBOL(revalidate_disk);
	1417
	1418	/*
	1419	* This routine checks whether a removable media has been changed,
	1420	* and invalidates all buffer-cache-entries in that case. This
	1421	* is a relatively slow routine, so we have to try to minimize using
	1422	* it. Thus it is called only upon a 'mount' or 'open'. This
	1423	* is the best way of combining speed and utility, I think.
	1424	* People changing diskettes in the middle of an operation deserve
	1425	* to lose :-)
	1426	*/
	1427	int check_disk_change(struct block_device *bdev)
	1428	{
	1429	struct gendisk *disk = bdev->bd_disk;
	1430	const struct block_device_operations *bdops = disk->fops;
	1431	unsigned int events;
	1432
	1433	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE \|
	1434	DISK_EVENT_EJECT_REQUEST);
	1435	if (!(events & DISK_EVENT_MEDIA_CHANGE))
	1436	return 0;
	1437
	1438	flush_disk(bdev, true);
	1439	if (bdops->revalidate_disk)
	1440	bdops->revalidate_disk(bdev->bd_disk);
	1441	return 1;
	1442	}
	1443
	1444	EXPORT_SYMBOL(check_disk_change);
	1445
	1446	void bd_set_size(struct block_device *bdev, loff_t size)
	1447	{
	1448	inode_lock(bdev->bd_inode);
	1449	i_size_write(bdev->bd_inode, size);
	1450	inode_unlock(bdev->bd_inode);
	1451	}
	1452	EXPORT_SYMBOL(bd_set_size);
	1453
	1454	static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
	1455
	1456	/*
	1457	* bd_mutex locking:
	1458	*
	1459	* mutex_lock(part->bd_mutex)
	1460	* mutex_lock_nested(whole->bd_mutex, 1)
	1461	*/
	1462
	1463	static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
	1464	{
	1465	struct gendisk *disk;
	1466	int ret;
	1467	int partno;
	1468	int perm = 0;
	1469	bool first_open = false;
	1470
	1471	if (mode & FMODE_READ)
	1472	perm \|= MAY_READ;
	1473	if (mode & FMODE_WRITE)
	1474	perm \|= MAY_WRITE;
	1475	/*
	1476	* hooks: /n/, see "layering violations".
	1477	*/
	1478	if (!for_part) {
	1479	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
	1480	if (ret != 0) {
	1481	bdput(bdev);
	1482	return ret;
	1483	}
	1484	}
	1485
	1486	restart:
	1487
	1488	ret = -ENXIO;
	1489	disk = bdev_get_gendisk(bdev, &partno);
	1490	if (!disk)
	1491	goto out;
	1492
	1493	disk_block_events(disk);
	1494	mutex_lock_nested(&bdev->bd_mutex, for_part);
	1495	if (!bdev->bd_openers) {
	1496	first_open = true;
	1497	bdev->bd_disk = disk;
	1498	bdev->bd_queue = disk->queue;
	1499	bdev->bd_contains = bdev;
	1500	bdev->bd_partno = partno;
	1501
	1502	if (!partno) {
	1503	ret = -ENXIO;
	1504	bdev->bd_part = disk_get_part(disk, partno);
	1505	if (!bdev->bd_part)
	1506	goto out_clear;
	1507
	1508	ret = 0;
	1509	if (disk->fops->open) {
	1510	ret = disk->fops->open(bdev, mode);
	1511	if (ret == -ERESTARTSYS) {
	1512	/* Lost a race with 'disk' being
	1513	* deleted, try again.
	1514	* See md.c
	1515	*/
	1516	disk_put_part(bdev->bd_part);
	1517	bdev->bd_part = NULL;
	1518	bdev->bd_disk = NULL;
	1519	bdev->bd_queue = NULL;
	1520	mutex_unlock(&bdev->bd_mutex);
	1521	disk_unblock_events(disk);
	1522	put_disk_and_module(disk);
	1523	goto restart;
	1524	}
	1525	}
	1526
	1527	if (!ret) {
	1528	bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
	1529	set_init_blocksize(bdev);
	1530	}
	1531
	1532	/*
	1533	* If the device is invalidated, rescan partition
	1534	* if open succeeded or failed with -ENOMEDIUM.
	1535	* The latter is necessary to prevent ghost
	1536	* partitions on a removed medium.
	1537	*/
	1538	if (bdev->bd_invalidated) {
	1539	if (!ret)
	1540	rescan_partitions(disk, bdev);
	1541	else if (ret == -ENOMEDIUM)
	1542	invalidate_partitions(disk, bdev);
	1543	}
	1544
	1545	if (ret)
	1546	goto out_clear;
	1547	} else {
	1548	struct block_device *whole;
	1549	whole = bdget_disk(disk, 0);
	1550	ret = -ENOMEM;
	1551	if (!whole)
	1552	goto out_clear;
	1553	BUG_ON(for_part);
	1554	ret = __blkdev_get(whole, mode, 1);
	1555	if (ret)
	1556	goto out_clear;
	1557	bdev->bd_contains = whole;
	1558	bdev->bd_part = disk_get_part(disk, partno);
	1559	if (!(disk->flags & GENHD_FL_UP) \|\|
	1560	!bdev->bd_part \|\| !bdev->bd_part->nr_sects) {
	1561	ret = -ENXIO;
	1562	goto out_clear;
	1563	}
	1564	bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
	1565	set_init_blocksize(bdev);
	1566	}
	1567
	1568	if (bdev->bd_bdi == &noop_backing_dev_info)
	1569	bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
	1570	} else {
	1571	if (bdev->bd_contains == bdev) {
	1572	ret = 0;
	1573	if (bdev->bd_disk->fops->open)
	1574	ret = bdev->bd_disk->fops->open(bdev, mode);
	1575	/* the same as first opener case, read comment there */
	1576	if (bdev->bd_invalidated) {
	1577	if (!ret)
	1578	rescan_partitions(bdev->bd_disk, bdev);
	1579	else if (ret == -ENOMEDIUM)
	1580	invalidate_partitions(bdev->bd_disk, bdev);
	1581	}
	1582	if (ret)
	1583	goto out_unlock_bdev;
	1584	}
	1585	}
	1586	bdev->bd_openers++;
	1587	if (for_part)
	1588	bdev->bd_part_count++;
	1589	mutex_unlock(&bdev->bd_mutex);
	1590	disk_unblock_events(disk);
	1591	/* only one opener holds refs to the module and disk */
	1592	if (!first_open)
	1593	put_disk_and_module(disk);
	1594	return 0;
	1595
	1596	out_clear:
	1597	disk_put_part(bdev->bd_part);
	1598	bdev->bd_disk = NULL;
	1599	bdev->bd_part = NULL;
	1600	bdev->bd_queue = NULL;
	1601	if (bdev != bdev->bd_contains)
	1602	__blkdev_put(bdev->bd_contains, mode, 1);
	1603	bdev->bd_contains = NULL;
	1604	out_unlock_bdev:
	1605	mutex_unlock(&bdev->bd_mutex);
	1606	disk_unblock_events(disk);
	1607	put_disk_and_module(disk);
	1608	out:
	1609	bdput(bdev);
	1610
	1611	return ret;
	1612	}
	1613
	1614	/**
	1615	* blkdev_get - open a block device
	1616	* @bdev: block_device to open
	1617	* @mode: FMODE_* mask
	1618	* @holder: exclusive holder identifier
	1619	*
	1620	* Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
	1621	* open with exclusive access. Specifying %FMODE_EXCL with %NULL
	1622	* @holder is invalid. Exclusive opens may nest for the same @holder.
	1623	*
	1624	* On success, the reference count of @bdev is unchanged. On failure,
	1625	* @bdev is put.
	1626	*
	1627	* CONTEXT:
	1628	* Might sleep.
	1629	*
	1630	* RETURNS:
	1631	* 0 on success, -errno on failure.
	1632	*/
	1633	int blkdev_get(struct block_device bdev, fmode_t mode, void holder)
	1634	{
	1635	struct block_device *whole = NULL;
	1636	int res;
	1637
	1638	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
	1639
	1640	if ((mode & FMODE_EXCL) && holder) {
	1641	whole = bd_start_claiming(bdev, holder);
	1642	if (IS_ERR(whole)) {
	1643	bdput(bdev);
	1644	return PTR_ERR(whole);
	1645	}
	1646	}
	1647
	1648	res = __blkdev_get(bdev, mode, 0);
	1649
	1650	if (whole) {
	1651	struct gendisk *disk = whole->bd_disk;
	1652
	1653	/* finish claiming */
	1654	mutex_lock(&bdev->bd_mutex);
	1655	spin_lock(&bdev_lock);
	1656
	1657	if (!res) {
	1658	BUG_ON(!bd_may_claim(bdev, whole, holder));
	1659	/*
	1660	* Note that for a whole device bd_holders
	1661	* will be incremented twice, and bd_holder
	1662	* will be set to bd_may_claim before being
	1663	* set to holder
	1664	*/
	1665	whole->bd_holders++;
	1666	whole->bd_holder = bd_may_claim;
	1667	bdev->bd_holders++;
	1668	bdev->bd_holder = holder;
	1669	}
	1670
	1671	/* tell others that we're done */
	1672	BUG_ON(whole->bd_claiming != holder);
	1673	whole->bd_claiming = NULL;
	1674	wake_up_bit(&whole->bd_claiming, 0);
	1675
	1676	spin_unlock(&bdev_lock);
	1677
	1678	/*
	1679	* Block event polling for write claims if requested. Any
	1680	* write holder makes the write_holder state stick until
	1681	* all are released. This is good enough and tracking
	1682	* individual writeable reference is too fragile given the
	1683	* way @mode is used in blkdev_get/put().
	1684	*/
	1685	if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
	1686	(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
	1687	bdev->bd_write_holder = true;
	1688	disk_block_events(disk);
	1689	}
	1690
	1691	mutex_unlock(&bdev->bd_mutex);
	1692	bdput(whole);
	1693	}
	1694
	1695	return res;
	1696	}
	1697	EXPORT_SYMBOL(blkdev_get);
	1698
	1699	/**
	1700	* blkdev_get_by_path - open a block device by name
	1701	* @path: path to the block device to open
	1702	* @mode: FMODE_* mask
	1703	* @holder: exclusive holder identifier
	1704	*
	1705	* Open the blockdevice described by the device file at @path. @mode
	1706	* and @holder are identical to blkdev_get().
	1707	*
	1708	* On success, the returned block_device has reference count of one.
	1709	*
	1710	* CONTEXT:
	1711	* Might sleep.
	1712	*
	1713	* RETURNS:
	1714	* Pointer to block_device on success, ERR_PTR(-errno) on failure.
	1715	*/
	1716	struct block_device blkdev_get_by_path(const char path, fmode_t mode,
	1717	void *holder)
	1718	{
	1719	struct block_device *bdev;
	1720	int err;
	1721
	1722	bdev = lookup_bdev(path);
	1723	if (IS_ERR(bdev))
	1724	return bdev;
	1725
	1726	err = blkdev_get(bdev, mode, holder);
	1727	if (err)
	1728	return ERR_PTR(err);
	1729
	1730	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
	1731	blkdev_put(bdev, mode);
	1732	return ERR_PTR(-EACCES);
	1733	}
	1734
	1735	return bdev;
	1736	}
	1737	EXPORT_SYMBOL(blkdev_get_by_path);
	1738
	1739	/**
	1740	* blkdev_get_by_dev - open a block device by device number
	1741	* @dev: device number of block device to open
	1742	* @mode: FMODE_* mask
	1743	* @holder: exclusive holder identifier
	1744	*
	1745	* Open the blockdevice described by device number @dev. @mode and
	1746	* @holder are identical to blkdev_get().
	1747	*
	1748	* Use it ONLY if you really do not have anything better - i.e. when
	1749	* you are behind a truly sucky interface and all you are given is a
	1750	* device number. _Never_ to be used for internal purposes. If you
	1751	* ever need it - reconsider your API.
	1752	*
	1753	* On success, the returned block_device has reference count of one.
	1754	*
	1755	* CONTEXT:
	1756	* Might sleep.
	1757	*
	1758	* RETURNS:
	1759	* Pointer to block_device on success, ERR_PTR(-errno) on failure.
	1760	*/
	1761	struct block_device blkdev_get_by_dev(dev_t dev, fmode_t mode, void holder)
	1762	{
	1763	struct block_device *bdev;
	1764	int err;
	1765
	1766	bdev = bdget(dev);
	1767	if (!bdev)
	1768	return ERR_PTR(-ENOMEM);
	1769
	1770	err = blkdev_get(bdev, mode, holder);
	1771	if (err)
	1772	return ERR_PTR(err);
	1773
	1774	return bdev;
	1775	}
	1776	EXPORT_SYMBOL(blkdev_get_by_dev);
	1777
	1778	static int blkdev_open(struct inode * inode, struct file * filp)
	1779	{
	1780	struct block_device *bdev;
	1781
	1782	/*
	1783	* Preserve backwards compatibility and allow large file access
	1784	* even if userspace doesn't ask for it explicitly. Some mkfs
	1785	* binary needs it. We might want to drop this workaround
	1786	* during an unstable branch.
	1787	*/
	1788	filp->f_flags \|= O_LARGEFILE;
	1789
	1790	filp->f_mode \|= FMODE_NOWAIT;
	1791
	1792	if (filp->f_flags & O_NDELAY)
	1793	filp->f_mode \|= FMODE_NDELAY;
	1794	if (filp->f_flags & O_EXCL)
	1795	filp->f_mode \|= FMODE_EXCL;
	1796	if ((filp->f_flags & O_ACCMODE) == 3)
	1797	filp->f_mode \|= FMODE_WRITE_IOCTL;
	1798
	1799	bdev = bd_acquire(inode);
	1800	if (bdev == NULL)
	1801	return -ENOMEM;
	1802
	1803	filp->f_mapping = bdev->bd_inode->i_mapping;
	1804	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	1805
	1806	return blkdev_get(bdev, filp->f_mode, filp);
	1807	}
	1808
	1809	static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
	1810	{
	1811	struct gendisk *disk = bdev->bd_disk;
	1812	struct block_device *victim = NULL;
	1813
	1814	mutex_lock_nested(&bdev->bd_mutex, for_part);
	1815	if (for_part)
	1816	bdev->bd_part_count--;
	1817
	1818	if (!--bdev->bd_openers) {
	1819	WARN_ON_ONCE(bdev->bd_holders);
	1820	sync_blockdev(bdev);
	1821	kill_bdev(bdev);
	1822
	1823	bdev_write_inode(bdev);
	1824	}
	1825	if (bdev->bd_contains == bdev) {
	1826	if (disk->fops->release)
	1827	disk->fops->release(disk, mode);
	1828	}
	1829	if (!bdev->bd_openers) {
	1830	disk_put_part(bdev->bd_part);
	1831	bdev->bd_part = NULL;
	1832	bdev->bd_disk = NULL;
	1833	if (bdev != bdev->bd_contains)
	1834	victim = bdev->bd_contains;
	1835	bdev->bd_contains = NULL;
	1836
	1837	put_disk_and_module(disk);
	1838	}
	1839	mutex_unlock(&bdev->bd_mutex);
	1840	bdput(bdev);
	1841	if (victim)
	1842	__blkdev_put(victim, mode, 1);
	1843	}
	1844
	1845	void blkdev_put(struct block_device *bdev, fmode_t mode)
	1846	{
	1847	mutex_lock(&bdev->bd_mutex);
	1848
	1849	if (mode & FMODE_EXCL) {
	1850	bool bdev_free;
	1851
	1852	/*
	1853	* Release a claim on the device. The holder fields
	1854	* are protected with bdev_lock. bd_mutex is to
	1855	* synchronize disk_holder unlinking.
	1856	*/
	1857	spin_lock(&bdev_lock);
	1858
	1859	WARN_ON_ONCE(--bdev->bd_holders < 0);
	1860	WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
	1861
	1862	/* bd_contains might point to self, check in a separate step */
	1863	if ((bdev_free = !bdev->bd_holders))
	1864	bdev->bd_holder = NULL;
	1865	if (!bdev->bd_contains->bd_holders)
	1866	bdev->bd_contains->bd_holder = NULL;
	1867
	1868	spin_unlock(&bdev_lock);
	1869
	1870	/*
	1871	* If this was the last claim, remove holder link and
	1872	* unblock evpoll if it was a write holder.
	1873	*/
	1874	if (bdev_free && bdev->bd_write_holder) {
	1875	disk_unblock_events(bdev->bd_disk);
	1876	bdev->bd_write_holder = false;
	1877	}
	1878	}
	1879
	1880	/*
	1881	* Trigger event checking and tell drivers to flush MEDIA_CHANGE
	1882	* event. This is to ensure detection of media removal commanded
	1883	* from userland - e.g. eject(1).
	1884	*/
	1885	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
	1886
	1887	mutex_unlock(&bdev->bd_mutex);
	1888
	1889	__blkdev_put(bdev, mode, 0);
	1890	}
	1891	EXPORT_SYMBOL(blkdev_put);
	1892
	1893	static int blkdev_close(struct inode * inode, struct file * filp)
	1894	{
	1895	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
	1896	blkdev_put(bdev, filp->f_mode);
	1897	return 0;
	1898	}
	1899
	1900	static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
	1901	{
	1902	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	1903	fmode_t mode = file->f_mode;
	1904
	1905	/*
	1906	* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	1907	* to updated it before every ioctl.
	1908	*/
	1909	if (file->f_flags & O_NDELAY)
	1910	mode \|= FMODE_NDELAY;
	1911	else
	1912	mode &= ~FMODE_NDELAY;
	1913
	1914	return blkdev_ioctl(bdev, mode, cmd, arg);
	1915	}
	1916
	1917	/*
	1918	* Write data to the block device. Only intended for the block device itself
	1919	* and the raw driver which basically is a fake block device.
	1920	*
	1921	* Does not take i_mutex for the write and thus is not for general purpose
	1922	* use.
	1923	*/
	1924	ssize_t blkdev_write_iter(struct kiocb iocb, struct iov_iter from)
	1925	{
	1926	struct file *file = iocb->ki_filp;
	1927	struct inode *bd_inode = bdev_file_inode(file);
	1928	loff_t size = i_size_read(bd_inode);
	1929	struct blk_plug plug;
	1930	ssize_t ret;
	1931
	1932	if (bdev_read_only(I_BDEV(bd_inode)))
	1933	return -EPERM;
	1934
	1935	if (!iov_iter_count(from))
	1936	return 0;
	1937
	1938	if (iocb->ki_pos >= size)
	1939	return -ENOSPC;
	1940
	1941	if ((iocb->ki_flags & (IOCB_NOWAIT \| IOCB_DIRECT)) == IOCB_NOWAIT)
	1942	return -EOPNOTSUPP;
	1943
	1944	iov_iter_truncate(from, size - iocb->ki_pos);
	1945
	1946	blk_start_plug(&plug);
	1947	ret = __generic_file_write_iter(iocb, from);
	1948	if (ret > 0)
	1949	ret = generic_write_sync(iocb, ret);
	1950	blk_finish_plug(&plug);
	1951	return ret;
	1952	}
	1953	EXPORT_SYMBOL_GPL(blkdev_write_iter);
	1954
	1955	ssize_t blkdev_read_iter(struct kiocb iocb, struct iov_iter to)
	1956	{
	1957	struct file *file = iocb->ki_filp;
	1958	struct inode *bd_inode = bdev_file_inode(file);
	1959	loff_t size = i_size_read(bd_inode);
	1960	loff_t pos = iocb->ki_pos;
	1961
	1962	if (pos >= size)
	1963	return 0;
	1964
	1965	size -= pos;
	1966	iov_iter_truncate(to, size);
	1967	return generic_file_read_iter(iocb, to);
	1968	}
	1969	EXPORT_SYMBOL_GPL(blkdev_read_iter);
	1970
	1971	/*
	1972	* Try to release a page associated with block device when the system
	1973	* is under memory pressure.
	1974	*/
	1975	static int blkdev_releasepage(struct page *page, gfp_t wait)
	1976	{
	1977	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
	1978
	1979	if (super && super->s_op->bdev_try_to_free_page)
	1980	return super->s_op->bdev_try_to_free_page(super, page, wait);
	1981
	1982	return try_to_free_buffers(page);
	1983	}
	1984
	/* ->writepages() for block devices: thin wrapper around generic_writepages(). */
	static int blkdev_writepages(struct address_space *mapping,
				     struct writeback_control *wbc)
	{
		return generic_writepages(mapping, wbc);
	}
	1990
	1991	static const struct address_space_operations def_blk_aops = {
	1992	.readpage = blkdev_readpage,
	1993	.readpages = blkdev_readpages,
	1994	.writepage = blkdev_writepage,
	1995	.write_begin = blkdev_write_begin,
	1996	.write_end = blkdev_write_end,
	1997	.writepages = blkdev_writepages,
	1998	.releasepage = blkdev_releasepage,
	1999	.direct_IO = blkdev_direct_IO,
	2000	.migratepage = buffer_migrate_page_norefs,
	2001	.is_dirty_writeback = buffer_check_dirty_writeback,
	2002	};
	2003
	/* fallocate() mode bits accepted by blkdev_fallocate(). */
	#define	BLKDEV_FALLOC_FL_SUPPORTED					\
			(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
			 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
	2007
	2008	static long blkdev_fallocate(struct file *file, int mode, loff_t start,
	2009	loff_t len)
	2010	{
	2011	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
	2012	struct address_space *mapping;
	2013	loff_t end = start + len - 1;
	2014	loff_t isize;
	2015	int error;
	2016
	2017	/* Fail if we don't recognize the flags. */
	2018	if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
	2019	return -EOPNOTSUPP;
	2020
	2021	/* Don't go off the end of the device. */
	2022	isize = i_size_read(bdev->bd_inode);
	2023	if (start >= isize)
	2024	return -EINVAL;
	2025	if (end >= isize) {
	2026	if (mode & FALLOC_FL_KEEP_SIZE) {
	2027	len = isize - start;
	2028	end = start + len - 1;
	2029	} else
	2030	return -EINVAL;
	2031	}
	2032
	2033	/*
	2034	* Don't allow IO that isn't aligned to logical block size.
	2035	*/
	2036	if ((start \| len) & (bdev_logical_block_size(bdev) - 1))
	2037	return -EINVAL;
	2038
	2039	/* Invalidate the page cache, including dirty pages. */
	2040	mapping = bdev->bd_inode->i_mapping;
	2041	truncate_inode_pages_range(mapping, start, end);
	2042
	2043	switch (mode) {
	2044	case FALLOC_FL_ZERO_RANGE:
	2045	case FALLOC_FL_ZERO_RANGE \| FALLOC_FL_KEEP_SIZE:
	2046	error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
	2047	GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
	2048	break;
	2049	case FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE:
	2050	error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
	2051	GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
	2052	break;
	2053	case FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE \| FALLOC_FL_NO_HIDE_STALE:
	2054	error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
	2055	GFP_KERNEL, 0);
	2056	break;
	2057	default:
	2058	return -EOPNOTSUPP;
	2059	}
	2060	if (error)
	2061	return error;
	2062
	2063	/*
	2064	* Invalidate again; if someone wandered in and dirtied a page,
	2065	* the caller will be given -EBUSY. The third argument is
	2066	* inclusive, so the rounding here is safe.
	2067	*/
	2068	return invalidate_inode_pages2_range(mapping,
	2069	start >> PAGE_SHIFT,
	2070	end >> PAGE_SHIFT);
	2071	}
	2072
	2073	const struct file_operations def_blk_fops = {
	2074	.open = blkdev_open,
	2075	.release = blkdev_close,
	2076	.llseek = block_llseek,
	2077	.read_iter = blkdev_read_iter,
	2078	.write_iter = blkdev_write_iter,
	2079	.mmap = generic_file_mmap,
	2080	.fsync = blkdev_fsync,
	2081	.unlocked_ioctl = block_ioctl,
	2082	#ifdef CONFIG_COMPAT
	2083	.compat_ioctl = compat_blkdev_ioctl,
	2084	#endif
	2085	.splice_read = generic_file_splice_read,
	2086	.splice_write = iter_file_splice_write,
	2087	.fallocate = blkdev_fallocate,
	2088	};
	2089
	2090	int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
	2091	{
	2092	int res;
	2093	mm_segment_t old_fs = get_fs();
	2094	set_fs(KERNEL_DS);
	2095	res = blkdev_ioctl(bdev, 0, cmd, arg);
	2096	set_fs(old_fs);
	2097	return res;
	2098	}
	2099
	2100	EXPORT_SYMBOL(ioctl_by_bdev);
	2101
	2102	/**
	2103	* lookup_bdev - lookup a struct block_device by name
	2104	* @pathname: special file representing the block device
	2105	*
	2106	* Get a reference to the blockdevice at @pathname in the current
	2107	* namespace if possible and return it. Return ERR_PTR(error)
	2108	* otherwise.
	2109	*/
	2110	struct block_device lookup_bdev(const char pathname)
	2111	{
	2112	struct block_device *bdev;
	2113	struct inode *inode;
	2114	struct path path;
	2115	int error;
	2116
	2117	if (!pathname \|\| !*pathname)
	2118	return ERR_PTR(-EINVAL);
	2119
	2120	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	2121	if (error)
	2122	return ERR_PTR(error);
	2123
	2124	inode = d_backing_inode(path.dentry);
	2125	error = -ENOTBLK;
	2126	if (!S_ISBLK(inode->i_mode))
	2127	goto fail;
	2128	error = -EACCES;
	2129	if (!may_open_dev(&path))
	2130	goto fail;
	2131	error = -ENOMEM;
	2132	bdev = bd_acquire(inode);
	2133	if (!bdev)
	2134	goto fail;
	2135	out:
	2136	path_put(&path);
	2137	return bdev;
	2138	fail:
	2139	bdev = ERR_PTR(error);
	2140	goto out;
	2141	}
	2142	EXPORT_SYMBOL(lookup_bdev);
	2143
	2144	int __invalidate_device(struct block_device *bdev, bool kill_dirty)
	2145	{
	2146	struct super_block *sb = get_super(bdev);
	2147	int res = 0;
	2148
	2149	if (sb) {
	2150	/*
	2151	* no need to lock the super, get_super holds the
	2152	* read mutex so the filesystem cannot go away
	2153	* under us (->put_super runs with the write lock
	2154	* hold).
	2155	*/
	2156	shrink_dcache_sb(sb);
	2157	res = invalidate_inodes(sb, kill_dirty);
	2158	drop_super(sb);
	2159	}
	2160	invalidate_bdev(bdev);
	2161	return res;
	2162	}
	2163	EXPORT_SYMBOL(__invalidate_device);
	2164
	2165	void iterate_bdevs(void (func)(struct block_device , void ), void arg)
	2166	{
	2167	struct inode inode, old_inode = NULL;
	2168
	2169	spin_lock(&blockdev_superblock->s_inode_list_lock);
	2170	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
	2171	struct address_space *mapping = inode->i_mapping;
	2172	struct block_device *bdev;
	2173
	2174	spin_lock(&inode->i_lock);
	2175	if (inode->i_state & (I_FREEING\|I_WILL_FREE\|I_NEW) \|\|
	2176	mapping->nrpages == 0) {
	2177	spin_unlock(&inode->i_lock);
	2178	continue;
	2179	}
	2180	__iget(inode);
	2181	spin_unlock(&inode->i_lock);
	2182	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	2183	/*
	2184	* We hold a reference to 'inode' so it couldn't have been
	2185	* removed from s_inodes list while we dropped the
	2186	* s_inode_list_lock We cannot iput the inode now as we can
	2187	* be holding the last reference and we cannot iput it under
	2188	* s_inode_list_lock. So we keep the reference and iput it
	2189	* later.
	2190	*/
	2191	iput(old_inode);
	2192	old_inode = inode;
	2193	bdev = I_BDEV(inode);
	2194
	2195	mutex_lock(&bdev->bd_mutex);
	2196	if (bdev->bd_openers)
	2197	func(bdev, arg);
	2198	mutex_unlock(&bdev->bd_mutex);
	2199
	2200	spin_lock(&blockdev_superblock->s_inode_list_lock);
	2201	}
	2202	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	2203	iput(old_inode);
	2204	}