Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* linux/fs/block_dev.c
	3	*
	4	* Copyright (C) 1991, 1992 Linus Torvalds
	5	* Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
	6	*/
	7
	8	#include <linux/init.h>
	9	#include <linux/mm.h>
	10	#include <linux/fcntl.h>
	11	#include <linux/slab.h>
	12	#include <linux/kmod.h>
	13	#include <linux/major.h>
	14	#include <linux/device_cgroup.h>
	15	#include <linux/highmem.h>
	16	#include <linux/blkdev.h>
	17	#include <linux/module.h>
	18	#include <linux/blkpg.h>
	19	#include <linux/buffer_head.h>
	20	#include <linux/pagevec.h>
	21	#include <linux/writeback.h>
	22	#include <linux/mpage.h>
	23	#include <linux/mount.h>
	24	#include <linux/uio.h>
	25	#include <linux/namei.h>
	26	#include <linux/log2.h>
	27	#include <linux/kmemleak.h>
	28	#include <asm/uaccess.h>
	29	#include "internal.h"
	30
	/*
	 * Pairs a block_device with the inode it is allocated together with.
	 * BDEV_I() recovers this container from the vfs_inode member.
	 */
	31	struct bdev_inode {
	32	struct block_device bdev;
	33	struct inode vfs_inode;
	34	};
	35
	/*
	 * Forward declaration: bdget() installs this table as the a_ops of a
	 * newly created bdev inode. The definition is not in this part of the file.
	 */
	36	static const struct address_space_operations def_blk_aops;
	37
	/*
	 * Map an inode to the bdev_inode that embeds it, using container_of()
	 * on the vfs_inode member.
	 */
	38	static inline struct bdev_inode BDEV_I(struct inode inode)
	39	{
	40	return container_of(inode, struct bdev_inode, vfs_inode);
	41	}
	42
	/*
	 * Return the address of the block_device stored in the bdev_inode that
	 * contains @inode. Exported for use outside this file.
	 */
	43	inline struct block_device I_BDEV(struct inode inode)
	44	{
	45	return &BDEV_I(inode)->bdev;
	46	}
	47
	48	EXPORT_SYMBOL(I_BDEV);
	49
	50	/*
	51	* Move the inode from its current bdi to a new bdi. If the inode is dirty
	52	* we need to move it onto the dirty list of @dst so that the inode is always
	53	* on the right list. Both steps are done while holding inode_lock.
	54	*/
	55	static void bdev_inode_switch_bdi(struct inode *inode,
	56	struct backing_dev_info *dst)
	57	{
	58	spin_lock(&inode_lock);
	59	inode->i_data.backing_dev_info = dst;
	60	if (inode->i_state & I_DIRTY)
	61	list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	62	spin_unlock(&inode_lock);
	63	}
	64
	/*
	 * Return the size of @bdev in units of its current block size, i.e. the
	 * inode size shifted right by the block-size bits. When the inode size
	 * reads as zero the all-ones sector_t value is returned instead, so the
	 * callers' "block >= max_block()" checks never trigger.
	 */
	65	static sector_t max_block(struct block_device *bdev)
	66	{
	67	sector_t retval = ~((sector_t)0);
	68	loff_t sz = i_size_read(bdev->bd_inode);
	69
	70	if (sz) {
	71	unsigned int size = block_size(bdev);
	72	unsigned int sizebits = blksize_bits(size);
	73	retval = (sz >> sizebits);
	74	}
	75	return retval;
	76	}
	77
	78	/*
	 * Kill _all_ buffers and pagecache, dirty or not. Nothing is done when
	 * the mapping currently holds no pages.
	 */
	79	static void kill_bdev(struct block_device *bdev)
	80	{
	81	if (bdev->bd_inode->i_mapping->nrpages == 0)
	82	return;
	83	invalidate_bh_lrus();
	84	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
	85	}
	86
	/*
	 * Set the block size of @bdev to @size.
	 *
	 * Returns 0 on success (including when @size already is the current
	 * block size) and -EINVAL when @size is not a power of two in the range
	 * 512..PAGE_SIZE or is smaller than the device's logical block size.
	 *
	 * When the size really changes, dirty data is written out first with
	 * sync_blockdev() and the cached buffers/pages are dropped afterwards
	 * with kill_bdev().
	 */
	87	int set_blocksize(struct block_device *bdev, int size)
	88	{
	89	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	90	if (size > PAGE_SIZE \|\| size < 512 \|\| !is_power_of_2(size))
	91	return -EINVAL;
	92
	93	/* Size cannot be smaller than the size supported by the device */
	94	if (size < bdev_logical_block_size(bdev))
	95	return -EINVAL;
	96
	97	/* Don't change the size if it is same as current */
	98	if (bdev->bd_block_size != size) {
	99	sync_blockdev(bdev);
	100	bdev->bd_block_size = size;
	101	bdev->bd_inode->i_blkbits = blksize_bits(size);
	102	kill_bdev(bdev);
	103	}
	104	return 0;
	105	}
	106
	107	EXPORT_SYMBOL(set_blocksize);
	108
	/*
	 * Set the block size of the device behind @sb and record it in the
	 * superblock (s_blocksize and s_blocksize_bits).
	 *
	 * Returns the new block size, or 0 when set_blocksize() rejected @size.
	 * Note the inverted convention: 0 means failure here.
	 */
	109	int sb_set_blocksize(struct super_block *sb, int size)
	110	{
	111	if (set_blocksize(sb->s_bdev, size))
	112	return 0;
	113	/* If we get here, we know size is power of two
	114	* and its value is between 512 and PAGE_SIZE */
	115	sb->s_blocksize = size;
	116	sb->s_blocksize_bits = blksize_bits(size);
	117	return sb->s_blocksize;
	118	}
	119
	120	EXPORT_SYMBOL(sb_set_blocksize);
	121
	/*
	 * Like sb_set_blocksize(), but @size is first raised to the device's
	 * logical block size when it is smaller than that. Returns whatever
	 * sb_set_blocksize() returns (the block size used, or 0 on failure).
	 */
	122	int sb_min_blocksize(struct super_block *sb, int size)
	123	{
	124	int minsize = bdev_logical_block_size(sb->s_bdev);
	125	if (size < minsize)
	126	size = minsize;
	127	return sb_set_blocksize(sb, size);
	128	}
	129
	130	EXPORT_SYMBOL(sb_min_blocksize);
	131
	/*
	 * Single-block mapping callback for the block device inode: block
	 * @iblock of the inode maps to block @iblock of the device itself.
	 *
	 * For a block at or past max_block(): returns -EIO when @create is set,
	 * otherwise returns 0 and leaves @bh unmapped. In range: fills in
	 * b_bdev/b_blocknr, marks @bh mapped and returns 0.
	 */
	132	static int
	133	blkdev_get_block(struct inode *inode, sector_t iblock,
	134	struct buffer_head *bh, int create)
	135	{
	136	if (iblock >= max_block(I_BDEV(inode))) {
	137	if (create)
	138	return -EIO;
	139
	140	/*
	141	* for reads, we're just trying to fill a partial page.
	142	* return a hole, they will have to call get_block again
	143	* before they can fill it, and they will get -EIO at that
	144	* time
	145	*/
	146	return 0;
	147	}
	148	bh->b_bdev = I_BDEV(inode);
	149	bh->b_blocknr = iblock;
	150	set_buffer_mapped(bh);
	151	return 0;
	152	}
	153
	/*
	 * Multi-block mapping callback, passed to __blockdev_direct_IO() by
	 * blkdev_direct_IO(). On entry bh->b_size gives the number of bytes the
	 * caller would like mapped starting at @iblock; on return it holds the
	 * number of bytes actually mapped, trimmed so the range does not run
	 * past the end of the device.
	 *
	 * Returns -EIO for a @create request that starts at or beyond the end of
	 * the device, 0 otherwise. A read that starts beyond the end returns 0
	 * with @bh left unmapped and b_size set to 0.
	 */
	154	static int
	155	blkdev_get_blocks(struct inode *inode, sector_t iblock,
	156	struct buffer_head *bh, int create)
	157	{
	158	sector_t end_block = max_block(I_BDEV(inode));
	159	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
	160
	161	if ((iblock + max_blocks) > end_block) {
	162	max_blocks = end_block - iblock;
	163	if ((long)max_blocks <= 0) {
	164	if (create)
	165	return -EIO; /* write fully beyond EOF */
	166	/*
	167	* It is a read which is fully beyond EOF. We return
	168	* a !buffer_mapped buffer
	169	*/
	170	max_blocks = 0;
	171	}
	172	}
	173
	174	bh->b_bdev = I_BDEV(inode);
	175	bh->b_blocknr = iblock;
	176	bh->b_size = max_blocks << inode->i_blkbits;
	177	if (max_blocks)
	178	set_buffer_mapped(bh);
	179	return 0;
	180	}
	181
	/*
	 * Direct I/O entry point for block device files. The inode is taken from
	 * the file's mapping host and the request is forwarded unchanged to
	 * __blockdev_direct_IO() with blkdev_get_blocks as the mapping callback;
	 * the remaining callback/flag arguments are passed as NULL/0.
	 */
	182	static ssize_t
	183	blkdev_direct_IO(int rw, struct kiocb iocb, const struct iovec iov,
	184	loff_t offset, unsigned long nr_segs)
	185	{
	186	struct file *file = iocb->ki_filp;
	187	struct inode *inode = file->f_mapping->host;
	188
	189	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
	190	nr_segs, blkdev_get_blocks, NULL, NULL, 0);
	191	}
	192
	/*
	 * Write back the page cache of @bdev. With @wait == 0 this calls
	 * filemap_flush(); otherwise filemap_write_and_wait(). The result of
	 * the chosen call is returned. A NULL @bdev is accepted and yields 0.
	 */
	193	int __sync_blockdev(struct block_device *bdev, int wait)
	194	{
	195	if (!bdev)
	196	return 0;
	197	if (!wait)
	198	return filemap_flush(bdev->bd_inode->i_mapping);
	199	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
	200	}
	201
	202	/*
	203	* Write out and wait upon all the dirty data associated with a block
	204	* device via its mapping. Does not take the superblock lock.
	 * Equivalent to __sync_blockdev(bdev, 1); returns its result.
	205	*/
	206	int sync_blockdev(struct block_device *bdev)
	207	{
	208	return __sync_blockdev(bdev, 1);
	209	}
	210	EXPORT_SYMBOL(sync_blockdev);
	211
	212	/*
	213	* Write out and wait upon all dirty data associated with this
	214	* device. Filesystem data as well as the underlying block
	215	* device. Takes the superblock lock.
	 *
	 * Returns the result of sync_filesystem() when get_super() finds a
	 * superblock for @bdev, otherwise the result of sync_blockdev().
	216	*/
	217	int fsync_bdev(struct block_device *bdev)
	218	{
	219	struct super_block *sb = get_super(bdev);
	220	if (sb) {
	221	int res = sync_filesystem(sb);
	 /* release what get_super() handed us before returning */
	222	drop_super(sb);
	223	return res;
	224	}
	225	return sync_blockdev(bdev);
	226	}
	227	EXPORT_SYMBOL(fsync_bdev);
	228
	229	/**
	230	* freeze_bdev -- lock a filesystem and force it into a consistent state
	231	* @bdev: blockdevice to lock
	232	*
	233	* If a superblock is found on this device, we take the s_umount semaphore
	234	* on it to make sure nobody unmounts until the snapshot creation is done.
	235	* The reference counter (bd_fsfreeze_count) guarantees that only the last
	236	* unfreeze process can unfreeze the frozen filesystem actually when multiple
	237	* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
	238	* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
	239	* actually.
	240	*/
	241	struct super_block freeze_bdev(struct block_device bdev)
	242	{
	243	struct super_block *sb;
	244	int error = 0;
	245
	246	mutex_lock(&bdev->bd_fsfreeze_mutex);
	247	if (++bdev->bd_fsfreeze_count > 1) {
	248	/*
	249	* We don't even need to grab a reference - the first call
	250	* to freeze_bdev grab an active reference and only the last
	251	* thaw_bdev drops it.
	252	*/
	253	sb = get_super(bdev);
	254	drop_super(sb);
	255	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	256	return sb;
	257	}
	258
	259	sb = get_active_super(bdev);
	260	if (!sb)
	261	goto out;
	262	error = freeze_super(sb);
	263	if (error) {
	264	deactivate_super(sb);
	265	bdev->bd_fsfreeze_count--;
	266	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	267	return ERR_PTR(error);
	268	}
	269	deactivate_super(sb);
	270	out:
	271	sync_blockdev(bdev);
	272	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	273	return sb; /* thaw_bdev releases s->s_umount */
	274	}
	275	EXPORT_SYMBOL(freeze_bdev);
	276
	277	/**
	278	* thaw_bdev -- unlock filesystem
	279	* @bdev: blockdevice to unlock
	280	* @sb: associated superblock
	281	*
	282	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
	283	*/
	284	int thaw_bdev(struct block_device bdev, struct super_block sb)
	285	{
	286	int error = -EINVAL;
	287
	288	mutex_lock(&bdev->bd_fsfreeze_mutex);
	289	if (!bdev->bd_fsfreeze_count)
	290	goto out;
	291
	292	error = 0;
	293	if (--bdev->bd_fsfreeze_count > 0)
	294	goto out;
	295
	296	if (!sb)
	297	goto out;
	298
	299	error = thaw_super(sb);
	300	if (error) {
	301	bdev->bd_fsfreeze_count++;
	302	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	303	return error;
	304	}
	305	out:
	306	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	307	return 0;
	308	}
	309	EXPORT_SYMBOL(thaw_bdev);
	310
	/*
	 * Write one page of the block device: delegates to
	 * block_write_full_page() with blkdev_get_block as the block mapper and
	 * returns its result.
	 */
	311	static int blkdev_writepage(struct page page, struct writeback_control wbc)
	312	{
	313	return block_write_full_page(page, blkdev_get_block, wbc);
	314	}
	315
	/*
	 * Read one page of the block device: delegates to
	 * block_read_full_page() with blkdev_get_block as the block mapper.
	 * @file is not used.
	 */
	316	static int blkdev_readpage(struct file * file, struct page * page)
	317	{
	318	return block_read_full_page(page, blkdev_get_block);
	319	}
	320
	/*
	 * Start of a buffered write: forwards to block_write_begin() with
	 * blkdev_get_block as the block mapper and returns its result.
	 * @file and @fsdata are not used.
	 */
	321	static int blkdev_write_begin(struct file file, struct address_space mapping,
	322	loff_t pos, unsigned len, unsigned flags,
	323	struct page pagep, void fsdata)
	324	{
	325	return block_write_begin(mapping, pos, len, flags, pagep,
	326	blkdev_get_block);
	327	}
	328
	/*
	 * End of a buffered write: calls block_write_end(), then unlocks @page
	 * and drops one reference to it. Returns block_write_end()'s result.
	 */
	329	static int blkdev_write_end(struct file file, struct address_space mapping,
	330	loff_t pos, unsigned len, unsigned copied,
	331	struct page page, void fsdata)
	332	{
	333	int ret;
	334	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	335
	336	unlock_page(page);
	337	page_cache_release(page);
	338
	339	return ret;
	340	}
	341
	342	/*
	343	* private llseek:
	344	* The size used for seeking is read from the mapping host inode
	345	* (file->f_mapping->host) with i_size_read(), under that inode's i_mutex.
	 *
	 * @origin 2 (SEEK_END) makes @offset relative to the size, 1 (SEEK_CUR)
	 * relative to the current file position; any other value leaves @offset
	 * untouched, i.e. it is treated as absolute.
	 *
	 * Returns the new position, or -EINVAL when the resulting offset is
	 * negative or larger than the size. f_pos is only written on success.
	346	*/
	347	static loff_t block_llseek(struct file *file, loff_t offset, int origin)
	348	{
	349	struct inode *bd_inode = file->f_mapping->host;
	350	loff_t size;
	351	loff_t retval;
	352
	353	mutex_lock(&bd_inode->i_mutex);
	354	size = i_size_read(bd_inode);
	355
	356	switch (origin) {
	357	case 2:
	358	offset += size;
	359	break;
	360	case 1:
	361	offset += file->f_pos;
	362	}
	363	retval = -EINVAL;
	364	if (offset >= 0 && offset <= size) {
	365	if (offset != file->f_pos) {
	366	file->f_pos = offset;
	367	}
	368	retval = offset;
	369	}
	370	mutex_unlock(&bd_inode->i_mutex);
	371	return retval;
	372	}
	373
	/*
	 * fsync for block device files: issues a flush to the device with
	 * blkdev_issue_flush(). -EOPNOTSUPP from the flush is reported as
	 * success; any other result is returned as is. @datasync is not used.
	 *
	 * i_mutex of the mapping host inode is released around the flush and
	 * taken again before returning, so this expects to be entered with that
	 * mutex held. NOTE(review): confirm against the callers.
	 */
	374	int blkdev_fsync(struct file *filp, int datasync)
	375	{
	376	struct inode *bd_inode = filp->f_mapping->host;
	377	struct block_device *bdev = I_BDEV(bd_inode);
	378	int error;
	379
	380	/*
	381	* There is no need to serialise calls to blkdev_issue_flush with
	382	* i_mutex and doing so causes performance issues with concurrent
	383	* O_SYNC writers to a block device.
	384	*/
	385	mutex_unlock(&bd_inode->i_mutex);
	386
	387	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
	388	if (error == -EOPNOTSUPP)
	389	error = 0;
	390
	391	mutex_lock(&bd_inode->i_mutex);
	392
	393	return error;
	394	}
	395	EXPORT_SYMBOL(blkdev_fsync);
	396
	397	/*
	398	* pseudo-fs
	399	*/
	400
	/*
	 * bdev_lock: spinlock taken in this file around the all_bdevs list, the
	 * inode <-> block_device linkage (i_bdev, i_devices, bd_inodes) and the
	 * claiming state (bd_claiming, bd_holder).
	 * bdev_cachep: slab cache for struct bdev_inode, created in
	 * bdev_cache_init().
	 */
	401	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
	402	static struct kmem_cache * bdev_cachep __read_mostly;
	403
	/*
	 * ->alloc_inode for the bdev pseudo filesystem: allocate a bdev_inode
	 * from bdev_cachep and return its embedded inode, or NULL when the
	 * allocation fails. @sb is not used.
	 */
	404	static struct inode bdev_alloc_inode(struct super_block sb)
	405	{
	406	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
	407	if (!ei)
	408	return NULL;
	409	return &ei->vfs_inode;
	410	}
	411
	/*
	 * RCU callback queued by bdev_destroy_inode(): recover the inode from
	 * its i_rcu head, re-initialise the i_dentry list head and give the
	 * containing bdev_inode back to bdev_cachep.
	 */
	412	static void bdev_i_callback(struct rcu_head *head)
	413	{
	414	struct inode *inode = container_of(head, struct inode, i_rcu);
	415	struct bdev_inode *bdi = BDEV_I(inode);
	416
	417	INIT_LIST_HEAD(&inode->i_dentry);
	418	kmem_cache_free(bdev_cachep, bdi);
	419	}
	420
	/*
	 * ->destroy_inode for the bdev pseudo filesystem: the actual free is
	 * deferred through call_rcu() to bdev_i_callback().
	 */
	421	static void bdev_destroy_inode(struct inode *inode)
	422	{
	423	call_rcu(&inode->i_rcu, bdev_i_callback);
	424	}
	425
	/*
	 * Constructor handed to kmem_cache_create() for bdev_cachep. @foo is the
	 * bdev_inode being constructed: its block_device is zeroed, the two
	 * mutexes and the two list heads are initialised, and the embedded inode
	 * gets inode_init_once().
	 */
	426	static void init_once(void *foo)
	427	{
	428	struct bdev_inode ei = (struct bdev_inode ) foo;
	429	struct block_device *bdev = &ei->bdev;
	430
	431	memset(bdev, 0, sizeof(*bdev));
	432	mutex_init(&bdev->bd_mutex);
	433	INIT_LIST_HEAD(&bdev->bd_inodes);
	434	INIT_LIST_HEAD(&bdev->bd_list);
	435	inode_init_once(&ei->vfs_inode);
	436	/* Initialize mutex for freeze. */
	437	mutex_init(&bdev->bd_fsfreeze_mutex);
	438	}
	439
	/*
	 * Detach @inode from its block device: unlink it from the i_devices
	 * list, clear i_bdev and point i_mapping back at the inode's own i_data.
	 * Both callers in this file hold bdev_lock; no reference is dropped here.
	 */
	440	static inline void __bd_forget(struct inode *inode)
	441	{
	442	list_del_init(&inode->i_devices);
	443	inode->i_bdev = NULL;
	444	inode->i_mapping = &inode->i_data;
	445	}
	446
	/*
	 * ->evict_inode for the bdev pseudo filesystem. Drops the inode's page
	 * cache and buffers and calls end_writeback(); then, under bdev_lock,
	 * detaches every inode still linked on bd_inodes and unlinks the bdev
	 * from the list it sits on via bd_list.
	 */
	447	static void bdev_evict_inode(struct inode *inode)
	448	{
	449	struct block_device *bdev = &BDEV_I(inode)->bdev;
	450	struct list_head *p;
	451	truncate_inode_pages(&inode->i_data, 0);
	452	invalidate_inode_buffers(inode); /* is it needed here? */
	453	end_writeback(inode);
	454	spin_lock(&bdev_lock);
	 /* __bd_forget() unlinks the entry, so the list head advances each pass */
	455	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
	456	__bd_forget(list_entry(p, struct inode, i_devices));
	457	}
	458	list_del_init(&bdev->bd_list);
	459	spin_unlock(&bdev_lock);
	460	}
	461
	/*
	 * Superblock operations of the bdev pseudo filesystem; handed to
	 * mount_pseudo() by bd_mount(). Inode allocation, destruction and
	 * eviction use the bdev_* helpers of this file.
	 */
	462	static const struct super_operations bdev_sops = {
	463	.statfs = simple_statfs,
	464	.alloc_inode = bdev_alloc_inode,
	465	.destroy_inode = bdev_destroy_inode,
	466	.drop_inode = generic_delete_inode,
	467	.evict_inode = bdev_evict_inode,
	468	};
	469
	/*
	 * ->mount for the "bdev" filesystem type: sets the pseudo filesystem up
	 * through mount_pseudo() with the name "bdev:", bdev_sops as superblock
	 * operations and magic number 0x62646576 (the ASCII bytes "bdev").
	 * @flags, @dev_name and @data are not used.
	 */
	470	static struct dentry bd_mount(struct file_system_type fs_type,
	471	int flags, const char dev_name, void data)
	472	{
	473	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
	474	}
	475
	/*
	 * Filesystem type of the bdev pseudo filesystem; registered and mounted
	 * once by bdev_cache_init().
	 */
	476	static struct file_system_type bd_type = {
	477	.name = "bdev",
	478	.mount = bd_mount,
	479	.kill_sb = kill_anon_super,
	480	};
	481
	/*
	 * Superblock of the mounted bdev pseudo filesystem. Set once in
	 * bdev_cache_init(); bdget() looks block device inodes up on it.
	 */
	482	struct super_block *blockdev_superblock __read_mostly;
	483
	/*
	 * Init-time setup of the block device layer's inode infrastructure:
	 * creates the bdev_inode slab cache (constructor init_once), registers
	 * the "bdev" filesystem type, mounts it with kern_mount() and records
	 * the resulting superblock in blockdev_superblock. Panics when either
	 * the registration or the mount fails.
	 */
	484	void __init bdev_cache_init(void)
	485	{
	486	int err;
	487	struct vfsmount *bd_mnt;
	488
	489	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
	490	0, (SLAB_HWCACHE_ALIGN\|SLAB_RECLAIM_ACCOUNT\|
	491	SLAB_MEM_SPREAD\|SLAB_PANIC),
	492	init_once);
	493	err = register_filesystem(&bd_type);
	494	if (err)
	495	panic("Cannot register bdev pseudo-fs");
	496	bd_mnt = kern_mount(&bd_type);
	497	if (IS_ERR(bd_mnt))
	498	panic("Cannot create bdev pseudo-fs");
	499	/*
	500	* This vfsmount structure is only used to obtain the
	501	* blockdev_superblock, so tell kmemleak not to report it.
	502	*/
	503	kmemleak_not_leak(bd_mnt);
	504	blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
	505	}
	506
	507	/*
	508	* Hash value for a device number: simply MAJOR(dev) + MINOR(dev).
	 * Most likely a _very_ bad one - but then it's hardly critical for small
	509	* /dev and can be fixed when somebody needs a really large one.
	510	* Keep in mind that it will be fed through the icache hash function too.
	511	*/
	512	static inline unsigned long hash(dev_t dev)
	513	{
	514	return MAJOR(dev)+MINOR(dev);
	515	}
	516
	/*
	 * Match callback given to iget5_locked() by bdget(): non-zero when the
	 * block_device embedded with @inode carries the device number passed
	 * through @data.
	 */
	517	static int bdev_test(struct inode inode, void data)
	518	{
	519	return BDEV_I(inode)->bdev.bd_dev == (dev_t )data;
	520	}
	521
	/*
	 * Init callback given to iget5_locked() by bdget(): stores the device
	 * number passed through @data in the block_device embedded with @inode.
	 * Always returns 0.
	 */
	522	static int bdev_set(struct inode inode, void data)
	523	{
	524	BDEV_I(inode)->bdev.bd_dev = (dev_t )data;
	525	return 0;
	526	}
	527
	/*
	 * List of block devices, linked through bd_list. bdget() adds new
	 * entries and nr_blockdev_pages() walks the list, both under bdev_lock.
	 */
	528	static LIST_HEAD(all_bdevs);
	529
	/*
	 * Look up, or create, the block_device for device number @dev.
	 *
	 * The inode comes from iget5_locked() on blockdev_superblock, keyed by
	 * hash(dev) and matched with bdev_test(). When the inode is new (I_NEW)
	 * the block_device and the inode are initialised here, the device is
	 * added to all_bdevs under bdev_lock and the inode is unlocked.
	 *
	 * Returns the block_device, or NULL when iget5_locked() returned no
	 * inode. bdput() drops the inode reference again.
	 */
	530	struct block_device *bdget(dev_t dev)
	531	{
	532	struct block_device *bdev;
	533	struct inode *inode;
	534
	535	inode = iget5_locked(blockdev_superblock, hash(dev),
	536	bdev_test, bdev_set, &dev);
	537
	538	if (!inode)
	539	return NULL;
	540
	541	bdev = &BDEV_I(inode)->bdev;
	542
	543	if (inode->i_state & I_NEW) {
	544	bdev->bd_contains = NULL;
	545	bdev->bd_inode = inode;
	546	bdev->bd_block_size = (1 << inode->i_blkbits);
	547	bdev->bd_part_count = 0;
	548	bdev->bd_invalidated = 0;
	549	inode->i_mode = S_IFBLK;
	550	inode->i_rdev = dev;
	551	inode->i_bdev = bdev;
	552	inode->i_data.a_ops = &def_blk_aops;
	553	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
	554	inode->i_data.backing_dev_info = &default_backing_dev_info;
	555	spin_lock(&bdev_lock);
	556	list_add(&bdev->bd_list, &all_bdevs);
	557	spin_unlock(&bdev_lock);
	558	unlock_new_inode(inode);
	559	}
	560	return bdev;
	561	}
	562
	563	EXPORT_SYMBOL(bdget);
	564
	565	/**
	566	* bdgrab -- Grab a reference to an already referenced block device
	567	* @bdev: Block device to grab a reference to.
	 *
	 * The reference is taken with ihold() on @bdev's inode. Returns @bdev.
	568	*/
	569	struct block_device bdgrab(struct block_device bdev)
	570	{
	571	ihold(bdev->bd_inode);
	572	return bdev;
	573	}
	574
	/*
	 * Return the sum of nrpages over the mappings of all block devices on
	 * all_bdevs. The list is walked under bdev_lock.
	 */
	575	long nr_blockdev_pages(void)
	576	{
	577	struct block_device *bdev;
	578	long ret = 0;
	579	spin_lock(&bdev_lock);
	580	list_for_each_entry(bdev, &all_bdevs, bd_list) {
	581	ret += bdev->bd_inode->i_mapping->nrpages;
	582	}
	583	spin_unlock(&bdev_lock);
	584	return ret;
	585	}
	586
	/*
	 * Drop a reference to @bdev; the reference lives on the device's inode
	 * and is released with iput().
	 */
	587	void bdput(struct block_device *bdev)
	588	{
	589	iput(bdev->bd_inode);
	590	}
	591
	592	EXPORT_SYMBOL(bdput);
	593
	/*
	 * Return the block_device behind device-node inode @inode with a
	 * reference held for the caller, or NULL when bdget() fails.
	 *
	 * Fast path: inode->i_bdev is already set, so just take a reference
	 * under bdev_lock. Slow path: look the device up with bdget(i_rdev) and,
	 * if nobody attached one in the meantime, link the inode to it (i_bdev,
	 * i_mapping, bd_inodes list) with an extra reference for that link.
	 */
	594	static struct block_device bd_acquire(struct inode inode)
	595	{
	596	struct block_device *bdev;
	597
	598	spin_lock(&bdev_lock);
	599	bdev = inode->i_bdev;
	600	if (bdev) {
	601	ihold(bdev->bd_inode);
	602	spin_unlock(&bdev_lock);
	603	return bdev;
	604	}
	605	spin_unlock(&bdev_lock);
	606
	607	bdev = bdget(inode->i_rdev);
	608	if (bdev) {
	609	spin_lock(&bdev_lock);
	 /* re-check: the lock was dropped around bdget() */
	610	if (!inode->i_bdev) {
	611	/*
	612	* We take an additional reference to bd_inode,
	613	* and it's released in clear_inode() of inode.
	614	* So, we can access it via ->i_mapping always
	615	* without igrab().
	616	*/
	617	ihold(bdev->bd_inode);
	618	inode->i_bdev = bdev;
	619	inode->i_mapping = bdev->bd_inode->i_mapping;
	620	list_add(&inode->i_devices, &bdev->bd_inodes);
	621	}
	622	spin_unlock(&bdev_lock);
	623	}
	624	return bdev;
	625	}
	626
	627	/*
	 * Call when you free inode.
	 *
	 * Detaches @inode from its block device under bdev_lock. The reference
	 * on the device's inode is dropped only when @inode does not belong to
	 * the block device superblock itself (sb_is_blkdev_sb()), and the iput()
	 * is done after bdev_lock has been released.
	 */
	628
	629	void bd_forget(struct inode *inode)
	630	{
	631	struct block_device *bdev = NULL;
	632
	633	spin_lock(&bdev_lock);
	634	if (inode->i_bdev) {
	635	if (!sb_is_blkdev_sb(inode->i_sb))
	636	bdev = inode->i_bdev;
	637	__bd_forget(inode);
	638	}
	639	spin_unlock(&bdev_lock);
	640
	641	if (bdev)
	642	iput(bdev->bd_inode);
	643	}
	644
	645	/**
	646	* bd_may_claim - test whether a block device can be claimed
	647	* @bdev: block device of interest
	648	* @whole: whole block device containing @bdev, may equal @bdev
	649	* @holder: holder trying to claim @bdev
	650	*
	651	* Test whether @bdev can be claimed by @holder. The address of this
	 * function itself is used as the bd_holder marker of a whole device
	 * that is held only because one of its partitions is.
	652	*
	653	* CONTEXT:
	654	* spin_lock(&bdev_lock).
	655	*
	656	* RETURNS:
	657	* %true if @bdev can be claimed, %false otherwise.
	658	*/
	659	static bool bd_may_claim(struct block_device bdev, struct block_device whole,
	660	void *holder)
	661	{
	662	if (bdev->bd_holder == holder)
	663	return true; /* already a holder */
	664	else if (bdev->bd_holder != NULL)
	665	return false; /* held by someone else */
	666	else if (bdev->bd_contains == bdev)
	667	return true; /* is a whole device which isn't held */
	668
	669	else if (whole->bd_holder == bd_may_claim)
	670	return true; /* is a partition of a device that is being partitioned */
	671	else if (whole->bd_holder != NULL)
	672	return false; /* is a partition of a held device */
	673	else
	674	return true; /* is a partition of an un-held device */
	675	}
	676
	677	/**
	678	* bd_prepare_to_claim - prepare to claim a block device
	679	* @bdev: block device of interest
	680	* @whole: the whole device containing @bdev, may equal @bdev
	681	* @holder: holder trying to claim @bdev
	682	*
	683	* Prepare to claim @bdev. This function fails if @bdev is already
	684	* claimed by another holder and waits if another claiming is in
	685	* progress. This function doesn't actually claim. On successful
	686	* return, the caller has ownership of bd_claiming and bd_holder[s].
	687	*
	688	* CONTEXT:
	689	* spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
	690	* it multiple times.
	691	*
	692	* RETURNS:
	693	* 0 if @bdev can be claimed, -EBUSY otherwise.
	694	*/
	695	static int bd_prepare_to_claim(struct block_device *bdev,
	696	struct block_device whole, void holder)
	697	{
	698	retry:
	699	/* if someone else claimed, fail */
	700	if (!bd_may_claim(bdev, whole, holder))
	701	return -EBUSY;
	702
	703	/* if claiming is already in progress, wait for it to finish */
	704	if (whole->bd_claiming) {
	705	wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
	706	DEFINE_WAIT(wait);
	707
	 /* queue ourselves before dropping bdev_lock, then sleep */
	708	prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
	709	spin_unlock(&bdev_lock);
	710	schedule();
	711	finish_wait(wq, &wait);
	712	spin_lock(&bdev_lock);
	 /* state may have changed while unlocked: start over */
	713	goto retry;
	714	}
	715
	716	/* yay, all mine */
	717	return 0;
	718	}
	719
	720	/**
	721	* bd_start_claiming - start claiming a block device
	722	* @bdev: block device of interest
	723	* @holder: holder trying to claim @bdev
	724	*
	725	* @bdev is about to be opened exclusively. Check @bdev can be opened
	726	* exclusively and mark that an exclusive open is in progress. Each
	727	* successful call to this function must be matched with a call to
	728	* either bd_finish_claiming() or bd_abort_claiming() (which do not
	729	* fail).
	730	*
	731	* This function is used to gain exclusive access to the block device
	732	* without actually causing other exclusive open attempts to fail. It
	733	* should be used when the open sequence itself requires exclusive
	734	* access but may subsequently fail.
	735	*
	736	* CONTEXT:
	737	* Might sleep.
	738	*
	739	* RETURNS:
	740	* Pointer to the block device containing @bdev on success, ERR_PTR()
	741	* value on failure: -ENXIO when no gendisk is found for @bdev, -ENOMEM
	 * when the whole device cannot be obtained, or the error from
	 * bd_prepare_to_claim().
	742	*/
	743	static struct block_device bd_start_claiming(struct block_device bdev,
	744	void *holder)
	745	{
	746	struct gendisk *disk;
	747	struct block_device *whole;
	748	int partno, err;
	749
	750	might_sleep();
	751
	752	/*
	753	* @bdev might not have been initialized properly yet, look up
	754	* and grab the outer block device the hard way.
	755	*/
	756	disk = get_gendisk(bdev->bd_dev, &partno);
	757	if (!disk)
	758	return ERR_PTR(-ENXIO);
	759
	760	whole = bdget_disk(disk, 0);
	 /*
	  * The disk was only needed to find the whole device; release the
	  * module and disk references again (presumably the ones handed
	  * out by get_gendisk() -- confirm).
	  */
	761	module_put(disk->fops->owner);
	762	put_disk(disk);
	763	if (!whole)
	764	return ERR_PTR(-ENOMEM);
	765
	766	/* prepare to claim, if successful, mark claiming in progress */
	767	spin_lock(&bdev_lock);
	768
	769	err = bd_prepare_to_claim(bdev, whole, holder);
	770	if (err == 0) {
	771	whole->bd_claiming = holder;
	772	spin_unlock(&bdev_lock);
	773	return whole;
	774	} else {
	775	spin_unlock(&bdev_lock);
	 /* give back the reference taken by bdget_disk() above */
	776	bdput(whole);
	777	return ERR_PTR(err);
	778	}
	779	}
	780
	781	#ifdef CONFIG_SYSFS
	 /*
	  * Create a sysfs link inside @from that points at @to and is named
	  * after @to. Returns the result of sysfs_create_link().
	  */
	782	static int add_symlink(struct kobject from, struct kobject to)
	783	{
	784	return sysfs_create_link(from, to, kobject_name(to));
	785	}
	786
	 /* Remove the link created by add_symlink(@from, @to). */
	787	static void del_symlink(struct kobject from, struct kobject to)
	788	{
	789	sysfs_remove_link(from, kobject_name(to));
	790	}
	791
	792	/**
	793	* bd_link_disk_holder - create symlinks between holding disk and slave bdev
	794	* @bdev: the claimed slave bdev
	795	* @disk: the holding disk
	796	*
	797	* This function creates the following sysfs symlinks.
	798	*
	799	* - from "slaves" directory of the holder @disk to the claimed @bdev
	800	* - from "holders" directory of the @bdev to the holder @disk
	801	*
	802	* For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
	803	* passed to bd_link_disk_holder(), then:
	804	*
	805	* /sys/block/dm-0/slaves/sda --> /sys/block/sda
	806	* /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
	807	*
	808	* The caller must have claimed @bdev before calling this function and
	809	* ensure that both @bdev and @disk are valid during the creation and
	810	* lifetime of these symlinks.
	811	*
	812	* CONTEXT:
	813	* Might sleep.
	814	*
	815	* RETURNS:
	816	* 0 on success, -errno on failure. 0 is also returned, after a warning,
	 * when one of the two sysfs directories does not exist.
	817	*/
	818	int bd_link_disk_holder(struct block_device bdev, struct gendisk disk)
	819	{
	820	int ret = 0;
	821
	822	mutex_lock(&bdev->bd_mutex);
	823
	824	WARN_ON_ONCE(!bdev->bd_holder \|\| bdev->bd_holder_disk);
	825
	826	/* FIXME: remove the following once add_disk() handles errors */
	827	if (WARN_ON(!disk->slave_dir \|\| !bdev->bd_part->holder_dir))
	828	goto out_unlock;
	829
	830	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	831	if (ret)
	832	goto out_unlock;
	833
	834	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	835	if (ret) {
	 /* second link failed: take the first one down again */
	836	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	837	goto out_unlock;
	838	}
	839
	840	bdev->bd_holder_disk = disk;
	841	out_unlock:
	842	mutex_unlock(&bdev->bd_mutex);
	843	return ret;
	844	}
	845	EXPORT_SYMBOL_GPL(bd_link_disk_holder);
	846
	 /*
	  * Undo bd_link_disk_holder(): clear bd_holder_disk and, when a holder
	  * disk was recorded, remove both symlinks. Takes no lock itself.
	  */
	847	static void bd_unlink_disk_holder(struct block_device *bdev)
	848	{
	849	struct gendisk *disk = bdev->bd_holder_disk;
	850
	851	bdev->bd_holder_disk = NULL;
	852	if (!disk)
	853	return;
	854
	855	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
	856	del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
	857	}
	858	#else
	 /* Without sysfs there are no links to remove. */
	859	static inline void bd_unlink_disk_holder(struct block_device *bdev)
	860	{ }
	861	#endif
	862
	863	/**
	864	* flush_disk - invalidates all buffer-cache entries on a disk
	865	*
	866	* @bdev: struct block device to be flushed
	867	*
	868	* Invalidates all buffer-cache entries on a disk. It should be called
	869	* when a disk has been changed -- either by a media change or online
	870	* resize. A warning is printed when __invalidate_device() returns
	 * non-zero. For a partitionable disk bd_invalidated is set, which makes
	 * __blkdev_get() rescan the partitions on a later open.
	871	*/
	872	static void flush_disk(struct block_device *bdev)
	873	{
	874	if (__invalidate_device(bdev)) {
	875	char name[BDEVNAME_SIZE] = "";
	876
	877	if (bdev->bd_disk)
	878	disk_name(bdev->bd_disk, 0, name);
	879	printk(KERN_WARNING "VFS: busy inodes on changed media or "
	880	"resized disk %s\n", name);
	881	}
	882
	883	if (!bdev->bd_disk)
	884	return;
	885	if (disk_partitionable(bdev->bd_disk))
	886	bdev->bd_invalidated = 1;
	887	}
	888
	889	/**
	890	* check_disk_size_change - checks for disk size change and adjusts bdev size.
	891	* @disk: struct gendisk to check
	892	* @bdev: struct bdev to adjust.
	893	*
	894	* This routine checks to see if the bdev size does not match the disk size
	895	* and adjusts it if it differs. The disk size is get_capacity() shifted
	 * left by 9. On a mismatch the change is logged, the inode size is
	 * updated and flush_disk() is called. revalidate_disk() calls this with
	 * bdev->bd_mutex held.
	896	*/
	897	void check_disk_size_change(struct gendisk disk, struct block_device bdev)
	898	{
	899	loff_t disk_size, bdev_size;
	900
	901	disk_size = (loff_t)get_capacity(disk) << 9;
	902	bdev_size = i_size_read(bdev->bd_inode);
	903	if (disk_size != bdev_size) {
	904	char name[BDEVNAME_SIZE];
	905
	906	disk_name(disk, 0, name);
	907	printk(KERN_INFO
	908	"%s: detected capacity change from %lld to %lld\n",
	909	name, bdev_size, disk_size);
	910	i_size_write(bdev->bd_inode, disk_size);
	911	flush_disk(bdev);
	912	}
	913	}
	914	EXPORT_SYMBOL(check_disk_size_change);
	915
	916	/**
	917	* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
	918	* @disk: struct gendisk to be revalidated
	919	*
	920	* This routine is a wrapper for lower-level driver's revalidate_disk
	921	* call-backs. It is used to do common pre and post operations needed
	922	* for all revalidate_disk operations.
	 *
	 * Returns the driver callback's result, or 0 when the driver has none.
	 * The size check on the whole-disk block device is skipped when that
	 * device cannot be obtained.
	923	*/
	924	int revalidate_disk(struct gendisk *disk)
	925	{
	926	struct block_device *bdev;
	927	int ret = 0;
	928
	929	if (disk->fops->revalidate_disk)
	930	ret = disk->fops->revalidate_disk(disk);
	931
	932	bdev = bdget_disk(disk, 0);
	933	if (!bdev)
	934	return ret;
	935
	936	mutex_lock(&bdev->bd_mutex);
	937	check_disk_size_change(disk, bdev);
	938	mutex_unlock(&bdev->bd_mutex);
	939	bdput(bdev);
	940	return ret;
	941	}
	942	EXPORT_SYMBOL(revalidate_disk);
	943
	944	/*
	945	* This routine checks whether a removable media has been changed,
	946	* and invalidates all buffer-cache-entries in that case. This
	947	* is a relatively slow routine, so we have to try to minimize using
	948	* it. Thus it is called only upon a 'mount' or 'open'. This
	949	* is the best way of combining speed and utility, I think.
	950	* People changing diskettes in the middle of an operation deserve
	951	* to lose :-)
	 *
	 * Returns 1 when a media-change event was pending (the disk has then
	 * been flushed and the driver's revalidate_disk callback, if any, run),
	 * 0 otherwise. Pending eject-request events are cleared as well.
	952	*/
	953	int check_disk_change(struct block_device *bdev)
	954	{
	955	struct gendisk *disk = bdev->bd_disk;
	956	const struct block_device_operations *bdops = disk->fops;
	957	unsigned int events;
	958
	959	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE \|
	960	DISK_EVENT_EJECT_REQUEST);
	961	if (!(events & DISK_EVENT_MEDIA_CHANGE))
	962	return 0;
	963
	964	flush_disk(bdev);
	965	if (bdops->revalidate_disk)
	966	bdops->revalidate_disk(bdev->bd_disk);
	967	return 1;
	968	}
	969
	970	EXPORT_SYMBOL(check_disk_change);
	971
	/*
	 * Set the size of @bdev's inode to @size and choose a block size to go
	 * with it: starting from the logical block size, the candidate is
	 * doubled until it reaches PAGE_CACHE_SIZE or the corresponding bit is
	 * set in @size. The result is stored in bd_block_size and i_blkbits.
	 */
	972	void bd_set_size(struct block_device *bdev, loff_t size)
	973	{
	974	unsigned bsize = bdev_logical_block_size(bdev);
	975
	976	bdev->bd_inode->i_size = size;
	977	while (bsize < PAGE_CACHE_SIZE) {
	978	if (size & bsize)
	979	break;
	980	bsize <<= 1;
	981	}
	982	bdev->bd_block_size = bsize;
	983	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
	984	}
	985	EXPORT_SYMBOL(bd_set_size);
	986
	/* Needed by the error path of __blkdev_get() below. */
	987	static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
	988
	989	/*
	990	* bd_mutex locking:
	991	*
	992	* mutex_lock(part->bd_mutex)
	993	* mutex_lock_nested(whole->bd_mutex, 1)
	994	*/
	995
	 /*
	  * Open @bdev with @mode.
	  *
	  * @for_part is non-zero when this is the nested open of the whole disk
	  * done on behalf of one of its partitions (see the recursive call
	  * below). In that case the device-cgroup permission check is skipped,
	  * bd_mutex is taken with that nesting level and bd_part_count is
	  * raised in addition to bd_openers.
	  *
	  * Returns 0 on success. On every failure path the reference on @bdev
	  * is dropped with bdput() and a negative errno is returned.
	  */
	996	static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
	997	{
	998	struct gendisk *disk;
	999	int ret;
	1000	int partno;
	1001	int perm = 0;
	1002
	1003	if (mode & FMODE_READ)
	1004	perm \|= MAY_READ;
	1005	if (mode & FMODE_WRITE)
	1006	perm \|= MAY_WRITE;
	1007	/*
	1008	* hooks: /n/, see "layering violations".
	1009	*/
	1010	if (!for_part) {
	1011	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
	1012	if (ret != 0) {
	1013	bdput(bdev);
	1014	return ret;
	1015	}
	1016	}
	1017
	1018	restart:
	1019
	1020	ret = -ENXIO;
	1021	disk = get_gendisk(bdev->bd_dev, &partno);
	1022	if (!disk)
	1023	goto out;
	1024
	1025	mutex_lock_nested(&bdev->bd_mutex, for_part);
	 /* first opener: bind the bdev to its disk and partition */
	1026	if (!bdev->bd_openers) {
	1027	bdev->bd_disk = disk;
	1028	bdev->bd_contains = bdev;
	 /* partno == 0: @bdev is the whole disk */
	1029	if (!partno) {
	1030	struct backing_dev_info *bdi;
	1031
	1032	ret = -ENXIO;
	1033	bdev->bd_part = disk_get_part(disk, partno);
	1034	if (!bdev->bd_part)
	1035	goto out_clear;
	1036
	1037	if (disk->fops->open) {
	1038	ret = disk->fops->open(bdev, mode);
	1039	if (ret == -ERESTARTSYS) {
	1040	/* Lost a race with 'disk' being
	1041	* deleted, try again.
	1042	* See md.c
	1043	*/
	1044	disk_put_part(bdev->bd_part);
	1045	bdev->bd_part = NULL;
	1046	module_put(disk->fops->owner);
	1047	put_disk(disk);
	1048	bdev->bd_disk = NULL;
	1049	mutex_unlock(&bdev->bd_mutex);
	1050	goto restart;
	1051	}
	1052	if (ret)
	1053	goto out_clear;
	1054	}
	1055	if (!bdev->bd_openers) {
	1056	bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
	1057	bdi = blk_get_backing_dev_info(bdev);
	1058	if (bdi == NULL)
	1059	bdi = &default_backing_dev_info;
	1060	bdev_inode_switch_bdi(bdev->bd_inode, bdi);
	1061	}
	1062	if (bdev->bd_invalidated)
	1063	rescan_partitions(disk, bdev);
	1064	} else {
	 /* @bdev is a partition: open the whole disk first */
	1065	struct block_device *whole;
	1066	whole = bdget_disk(disk, 0);
	1067	ret = -ENOMEM;
	1068	if (!whole)
	1069	goto out_clear;
	1070	BUG_ON(for_part);
	1071	ret = __blkdev_get(whole, mode, 1);
	1072	if (ret)
	1073	goto out_clear;
	1074	bdev->bd_contains = whole;
	1075	bdev_inode_switch_bdi(bdev->bd_inode,
	1076	whole->bd_inode->i_data.backing_dev_info);
	1077	bdev->bd_part = disk_get_part(disk, partno);
	1078	if (!(disk->flags & GENHD_FL_UP) \|\|
	1079	!bdev->bd_part \|\| !bdev->bd_part->nr_sects) {
	1080	ret = -ENXIO;
	1081	goto out_clear;
	1082	}
	1083	bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
	1084	}
	1085	} else {
	 /*
	  * Already open: the disk and module references from this
	  * lookup are not kept.
	  */
	1086	module_put(disk->fops->owner);
	1087	put_disk(disk);
	1088	disk = NULL;
	1089	if (bdev->bd_contains == bdev) {
	1090	if (bdev->bd_disk->fops->open) {
	1091	ret = bdev->bd_disk->fops->open(bdev, mode);
	1092	if (ret)
	1093	goto out_unlock_bdev;
	1094	}
	1095	if (bdev->bd_invalidated)
	1096	rescan_partitions(bdev->bd_disk, bdev);
	1097	}
	1098	}
	1099	bdev->bd_openers++;
	1100	if (for_part)
	1101	bdev->bd_part_count++;
	1102	mutex_unlock(&bdev->bd_mutex);
	1103	return 0;
	1104
	 /* undo the first-opener setup done above */
	1105	out_clear:
	1106	disk_put_part(bdev->bd_part);
	1107	bdev->bd_disk = NULL;
	1108	bdev->bd_part = NULL;
	1109	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
	 /* a partition that got as far as opening the whole disk closes it */
	1110	if (bdev != bdev->bd_contains)
	1111	__blkdev_put(bdev->bd_contains, mode, 1);
	1112	bdev->bd_contains = NULL;
	1113	out_unlock_bdev:
	1114	mutex_unlock(&bdev->bd_mutex);
	1115	out:
	 /* disk is NULL here when the lookup failed or the refs were dropped */
	1116	if (disk)
	1117	module_put(disk->fops->owner);
	1118	put_disk(disk);
	1119	bdput(bdev);
	1120
	1121	return ret;
	1122	}
	1123
	1124	/**
	1125	* blkdev_get - open a block device
	1126	* @bdev: block_device to open
	1127	* @mode: FMODE_* mask
	1128	* @holder: exclusive holder identifier
	1129	*
	1130	* Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
	1131	* open with exclusive access. Specifying %FMODE_EXCL with %NULL
	1132	* @holder is invalid. Exclusive opens may nest for the same @holder.
	1133	*
	1134	* On success, the reference count of @bdev is unchanged. On failure,
	1135	* @bdev is put.
	1136	*
	1137	* CONTEXT:
	1138	* Might sleep.
	1139	*
	1140	* RETURNS:
	1141	* 0 on success, -errno on failure.
	1142	*/
	1143	int blkdev_get(struct block_device bdev, fmode_t mode, void holder)
	1144	{
	1145	struct block_device *whole = NULL;
	1146	int res;
	1147
	1148	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
	1149
	1150	if ((mode & FMODE_EXCL) && holder) {
	1151	whole = bd_start_claiming(bdev, holder);
	1152	if (IS_ERR(whole)) {
	1153	bdput(bdev);
	1154	return PTR_ERR(whole);
	1155	}
	1156	}
	1157
	1158	res = __blkdev_get(bdev, mode, 0);
	1159
	1160	/* __blkdev_get() may alter read only status, check it afterwards */
	1161	if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
	1162	__blkdev_put(bdev, mode, 0);
	1163	res = -EACCES;
	1164	}
	1165
	1166	if (whole) {
	1167	/* finish claiming */
	1168	mutex_lock(&bdev->bd_mutex);
	1169	spin_lock(&bdev_lock);
	1170
	1171	if (!res) {
	1172	BUG_ON(!bd_may_claim(bdev, whole, holder));
	1173	/*
	1174	* Note that for a whole device bd_holders
	1175	* will be incremented twice, and bd_holder
	1176	* will be set to bd_may_claim before being
	1177	* set to holder
	1178	*/
	1179	whole->bd_holders++;
	1180	whole->bd_holder = bd_may_claim;
	1181	bdev->bd_holders++;
	1182	bdev->bd_holder = holder;
	1183	}
	1184
	1185	/* tell others that we're done */
	1186	BUG_ON(whole->bd_claiming != holder);
	1187	whole->bd_claiming = NULL;
	1188	wake_up_bit(&whole->bd_claiming, 0);
	1189
	1190	spin_unlock(&bdev_lock);
	1191
	1192	/*
	1193	* Block event polling for write claims. Any write
	1194	* holder makes the write_holder state stick until all
	1195	* are released. This is good enough and tracking
	1196	* individual writeable reference is too fragile given
	1197	* the way @mode is used in blkdev_get/put().
	1198	*/
	1199	if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
	1200	bdev->bd_write_holder = true;
	1201	disk_block_events(bdev->bd_disk);
	1202	}
	1203
	1204	mutex_unlock(&bdev->bd_mutex);
	1205	bdput(whole);
	1206	}
	1207
	1208	return res;
	1209	}
	1210	EXPORT_SYMBOL(blkdev_get);
	1211
	1212	/**
	1213	* blkdev_get_by_path - open a block device by name
	1214	* @path: path to the block device to open
	1215	* @mode: FMODE_* mask
	1216	* @holder: exclusive holder identifier
	1217	*
	1218	* Open the blockdevice described by the device file at @path. @mode
	1219	* and @holder are identical to blkdev_get().
	1220	*
	1221	* On success, the returned block_device has reference count of one.
	1222	*
	1223	* CONTEXT:
	1224	* Might sleep.
	1225	*
	1226	* RETURNS:
	1227	* Pointer to block_device on success, ERR_PTR(-errno) on failure.
	1228	*/
	1229	struct block_device blkdev_get_by_path(const char path, fmode_t mode,
	1230	void *holder)
	1231	{
	1232	struct block_device *bdev;
	1233	int err;
	1234
	1235	bdev = lookup_bdev(path);
	1236	if (IS_ERR(bdev))
	1237	return bdev;
	1238
	1239	err = blkdev_get(bdev, mode, holder);
	1240	if (err)
	1241	return ERR_PTR(err);
	1242
	1243	return bdev;
	1244	}
	1245	EXPORT_SYMBOL(blkdev_get_by_path);
	1246
	1247	/**
	1248	* blkdev_get_by_dev - open a block device by device number
	1249	* @dev: device number of block device to open
	1250	* @mode: FMODE_* mask
	1251	* @holder: exclusive holder identifier
	1252	*
	1253	* Open the blockdevice described by device number @dev. @mode and
	1254	* @holder are identical to blkdev_get().
	1255	*
	1256	* Use it ONLY if you really do not have anything better - i.e. when
	1257	* you are behind a truly sucky interface and all you are given is a
	1258	* device number. _Never_ to be used for internal purposes. If you
	1259	* ever need it - reconsider your API.
	1260	*
	1261	* On success, the returned block_device has reference count of one.
	1262	*
	1263	* CONTEXT:
	1264	* Might sleep.
	1265	*
	1266	* RETURNS:
	1267	* Pointer to block_device on success, ERR_PTR(-errno) on failure.
	1268	*/
	1269	struct block_device blkdev_get_by_dev(dev_t dev, fmode_t mode, void holder)
	1270	{
	1271	struct block_device *bdev;
	1272	int err;
	1273
	1274	bdev = bdget(dev);
	1275	if (!bdev)
	1276	return ERR_PTR(-ENOMEM);
	1277
	1278	err = blkdev_get(bdev, mode, holder);
	1279	if (err)
	1280	return ERR_PTR(err);
	1281
	1282	return bdev;
	1283	}
	1284	EXPORT_SYMBOL(blkdev_get_by_dev);
	1285
	1286	static int blkdev_open(struct inode * inode, struct file * filp)
	1287	{
	1288	struct block_device *bdev;
	1289
	1290	/*
	1291	* Preserve backwards compatibility and allow large file access
	1292	* even if userspace doesn't ask for it explicitly. Some mkfs
	1293	* binary needs it. We might want to drop this workaround
	1294	* during an unstable branch.
	1295	*/
	1296	filp->f_flags \|= O_LARGEFILE;
	1297
	1298	if (filp->f_flags & O_NDELAY)
	1299	filp->f_mode \|= FMODE_NDELAY;
	1300	if (filp->f_flags & O_EXCL)
	1301	filp->f_mode \|= FMODE_EXCL;
	1302	if ((filp->f_flags & O_ACCMODE) == 3)
	1303	filp->f_mode \|= FMODE_WRITE_IOCTL;
	1304
	1305	bdev = bd_acquire(inode);
	1306	if (bdev == NULL)
	1307	return -ENOMEM;
	1308
	1309	filp->f_mapping = bdev->bd_inode->i_mapping;
	1310
	1311	return blkdev_get(bdev, filp->f_mode, filp);
	1312	}
	1313
	1314	static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
	1315	{
	1316	int ret = 0;
	1317	struct gendisk *disk = bdev->bd_disk;
	1318	struct block_device *victim = NULL;
	1319
	1320	mutex_lock_nested(&bdev->bd_mutex, for_part);
	1321	if (for_part)
	1322	bdev->bd_part_count--;
	1323
	1324	if (!--bdev->bd_openers) {
	1325	WARN_ON_ONCE(bdev->bd_holders);
	1326	sync_blockdev(bdev);
	1327	kill_bdev(bdev);
	1328	}
	1329	if (bdev->bd_contains == bdev) {
	1330	if (disk->fops->release)
	1331	ret = disk->fops->release(disk, mode);
	1332	}
	1333	if (!bdev->bd_openers) {
	1334	struct module *owner = disk->fops->owner;
	1335
	1336	put_disk(disk);
	1337	module_put(owner);
	1338	disk_put_part(bdev->bd_part);
	1339	bdev->bd_part = NULL;
	1340	bdev->bd_disk = NULL;
	1341	bdev_inode_switch_bdi(bdev->bd_inode,
	1342	&default_backing_dev_info);
	1343	if (bdev != bdev->bd_contains)
	1344	victim = bdev->bd_contains;
	1345	bdev->bd_contains = NULL;
	1346	}
	1347	mutex_unlock(&bdev->bd_mutex);
	1348	bdput(bdev);
	1349	if (victim)
	1350	__blkdev_put(victim, mode, 1);
	1351	return ret;
	1352	}
	1353
	1354	int blkdev_put(struct block_device *bdev, fmode_t mode)
	1355	{
	1356	if (mode & FMODE_EXCL) {
	1357	bool bdev_free;
	1358
	1359	/*
	1360	* Release a claim on the device. The holder fields
	1361	* are protected with bdev_lock. bd_mutex is to
	1362	* synchronize disk_holder unlinking.
	1363	*/
	1364	mutex_lock(&bdev->bd_mutex);
	1365	spin_lock(&bdev_lock);
	1366
	1367	WARN_ON_ONCE(--bdev->bd_holders < 0);
	1368	WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
	1369
	1370	/* bd_contains might point to self, check in a separate step */
	1371	if ((bdev_free = !bdev->bd_holders))
	1372	bdev->bd_holder = NULL;
	1373	if (!bdev->bd_contains->bd_holders)
	1374	bdev->bd_contains->bd_holder = NULL;
	1375
	1376	spin_unlock(&bdev_lock);
	1377
	1378	/*
	1379	* If this was the last claim, remove holder link and
	1380	* unblock evpoll if it was a write holder.
	1381	*/
	1382	if (bdev_free) {
	1383	bd_unlink_disk_holder(bdev);
	1384	if (bdev->bd_write_holder) {
	1385	disk_unblock_events(bdev->bd_disk);
	1386	bdev->bd_write_holder = false;
	1387	} else
	1388	disk_check_events(bdev->bd_disk);
	1389	}
	1390
	1391	mutex_unlock(&bdev->bd_mutex);
	1392	} else
	1393	disk_check_events(bdev->bd_disk);
	1394
	1395	return __blkdev_put(bdev, mode, 0);
	1396	}
	1397	EXPORT_SYMBOL(blkdev_put);
	1398
	1399	static int blkdev_close(struct inode * inode, struct file * filp)
	1400	{
	1401	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	1402
	1403	return blkdev_put(bdev, filp->f_mode);
	1404	}
	1405
	1406	static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
	1407	{
	1408	struct block_device *bdev = I_BDEV(file->f_mapping->host);
	1409	fmode_t mode = file->f_mode;
	1410
	1411	/*
	1412	* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
	1413	* to updated it before every ioctl.
	1414	*/
	1415	if (file->f_flags & O_NDELAY)
	1416	mode \|= FMODE_NDELAY;
	1417	else
	1418	mode &= ~FMODE_NDELAY;
	1419
	1420	return blkdev_ioctl(bdev, mode, cmd, arg);
	1421	}
	1422
	1423	/*
	1424	* Write data to the block device. Only intended for the block device itself
	1425	* and the raw driver which basically is a fake block device.
	1426	*
	1427	* Does not take i_mutex for the write and thus is not for general purpose
	1428	* use.
	1429	*/
	1430	ssize_t blkdev_aio_write(struct kiocb iocb, const struct iovec iov,
	1431	unsigned long nr_segs, loff_t pos)
	1432	{
	1433	struct file *file = iocb->ki_filp;
	1434	ssize_t ret;
	1435
	1436	BUG_ON(iocb->ki_pos != pos);
	1437
	1438	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	1439	if (ret > 0 \|\| ret == -EIOCBQUEUED) {
	1440	ssize_t err;
	1441
	1442	err = generic_write_sync(file, pos, ret);
	1443	if (err < 0 && ret > 0)
	1444	ret = err;
	1445	}
	1446	return ret;
	1447	}
	1448	EXPORT_SYMBOL_GPL(blkdev_aio_write);
	1449
	1450	/*
	1451	* Try to release a page associated with block device when the system
	1452	* is under memory pressure.
	1453	*/
	1454	static int blkdev_releasepage(struct page *page, gfp_t wait)
	1455	{
	1456	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
	1457
	1458	if (super && super->s_op->bdev_try_to_free_page)
	1459	return super->s_op->bdev_try_to_free_page(super, page, wait);
	1460
	1461	return try_to_free_buffers(page);
	1462	}
	1463
	1464	static const struct address_space_operations def_blk_aops = {
	1465	.readpage = blkdev_readpage,
	1466	.writepage = blkdev_writepage,
	1467	.sync_page = block_sync_page,
	1468	.write_begin = blkdev_write_begin,
	1469	.write_end = blkdev_write_end,
	1470	.writepages = generic_writepages,
	1471	.releasepage = blkdev_releasepage,
	1472	.direct_IO = blkdev_direct_IO,
	1473	};
	1474
	1475	const struct file_operations def_blk_fops = {
	1476	.open = blkdev_open,
	1477	.release = blkdev_close,
	1478	.llseek = block_llseek,
	1479	.read = do_sync_read,
	1480	.write = do_sync_write,
	1481	.aio_read = generic_file_aio_read,
	1482	.aio_write = blkdev_aio_write,
	1483	.mmap = generic_file_mmap,
	1484	.fsync = blkdev_fsync,
	1485	.unlocked_ioctl = block_ioctl,
	1486	#ifdef CONFIG_COMPAT
	1487	.compat_ioctl = compat_blkdev_ioctl,
	1488	#endif
	1489	.splice_read = generic_file_splice_read,
	1490	.splice_write = generic_file_splice_write,
	1491	};
	1492
	1493	int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
	1494	{
	1495	int res;
	1496	mm_segment_t old_fs = get_fs();
	1497	set_fs(KERNEL_DS);
	1498	res = blkdev_ioctl(bdev, 0, cmd, arg);
	1499	set_fs(old_fs);
	1500	return res;
	1501	}
	1502
	1503	EXPORT_SYMBOL(ioctl_by_bdev);
	1504
	1505	/**
	1506	* lookup_bdev - lookup a struct block_device by name
	1507	* @pathname: special file representing the block device
	1508	*
	1509	* Get a reference to the blockdevice at @pathname in the current
	1510	* namespace if possible and return it. Return ERR_PTR(error)
	1511	* otherwise.
	1512	*/
	1513	struct block_device lookup_bdev(const char pathname)
	1514	{
	1515	struct block_device *bdev;
	1516	struct inode *inode;
	1517	struct path path;
	1518	int error;
	1519
	1520	if (!pathname \|\| !*pathname)
	1521	return ERR_PTR(-EINVAL);
	1522
	1523	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	1524	if (error)
	1525	return ERR_PTR(error);
	1526
	1527	inode = path.dentry->d_inode;
	1528	error = -ENOTBLK;
	1529	if (!S_ISBLK(inode->i_mode))
	1530	goto fail;
	1531	error = -EACCES;
	1532	if (path.mnt->mnt_flags & MNT_NODEV)
	1533	goto fail;
	1534	error = -ENOMEM;
	1535	bdev = bd_acquire(inode);
	1536	if (!bdev)
	1537	goto fail;
	1538	out:
	1539	path_put(&path);
	1540	return bdev;
	1541	fail:
	1542	bdev = ERR_PTR(error);
	1543	goto out;
	1544	}
	1545	EXPORT_SYMBOL(lookup_bdev);
	1546
	1547	int __invalidate_device(struct block_device *bdev)
	1548	{
	1549	struct super_block *sb = get_super(bdev);
	1550	int res = 0;
	1551
	1552	if (sb) {
	1553	/*
	1554	* no need to lock the super, get_super holds the
	1555	* read mutex so the filesystem cannot go away
	1556	* under us (->put_super runs with the write lock
	1557	* hold).
	1558	*/
	1559	shrink_dcache_sb(sb);
	1560	res = invalidate_inodes(sb);
	1561	drop_super(sb);
	1562	}
	1563	invalidate_bdev(bdev);
	1564	return res;
	1565	}
	1566	EXPORT_SYMBOL(__invalidate_device);