From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 27 May 2010 17:43:44 +0000 (-0700)
Subject: Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
X-Git-Tag: v2.6.35-rc1~36
X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/105a048a4f35f7a74c7cc20b36dd83658b6ec232?hp=-c

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (27 commits)
  Btrfs: add more error checking to btrfs_dirty_inode
  Btrfs: allow unaligned DIO
  Btrfs: drop verbose enospc printk
  Btrfs: Fix block generation verification race
  Btrfs: fix preallocation and nodatacow checks in O_DIRECT
  Btrfs: avoid ENOSPC errors in btrfs_dirty_inode
  Btrfs: move O_DIRECT space reservation to btrfs_direct_IO
  Btrfs: rework O_DIRECT enospc handling
  Btrfs: use async helpers for DIO write checksumming
  Btrfs: don't walk around with task->state != TASK_RUNNING
  Btrfs: do aio_write instead of write
  Btrfs: add basic DIO read/write support
  direct-io: do not merge logically non-contiguous requests
  direct-io: add a hook for the fs to provide its own submit_bio function
  fs: allow short direct-io reads to be completed via buffered IO
  Btrfs: Metadata ENOSPC handling for balance
  Btrfs: Pre-allocate space for data relocation
  Btrfs: Metadata ENOSPC handling for tree log
  Btrfs: Metadata reservation for orphan inodes
  Btrfs: Introduce global metadata reservation
  ...
---

105a048a4f35f7a74c7cc20b36dd83658b6ec232
diff --combined fs/btrfs/extent-tree.c
index c6a4f459ad76,6c14101506e1..b9080d71991a
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@@ -35,10 -35,9 +35,9 @@@
  
  static int update_block_group(struct btrfs_trans_handle *trans,
  			      struct btrfs_root *root,
- 			      u64 bytenr, u64 num_bytes, int alloc,
- 			      int mark_free);
- static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- 				   u64 num_bytes, int reserve);
+ 			      u64 bytenr, u64 num_bytes, int alloc);
+ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ 				 u64 num_bytes, int reserve, int sinfo);
  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  				struct btrfs_root *root,
  				u64 bytenr, u64 num_bytes, u64 parent,
@@@ -61,12 -60,6 +60,6 @@@ static int alloc_reserved_tree_block(st
  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  			  struct btrfs_root *extent_root, u64 alloc_bytes,
  			  u64 flags, int force);
- static int pin_down_bytes(struct btrfs_trans_handle *trans,
- 			  struct btrfs_root *root,
- 			  struct btrfs_path *path,
- 			  u64 bytenr, u64 num_bytes,
- 			  int is_data, int reserved,
- 			  struct extent_buffer **must_clean);
  static int find_next_key(struct btrfs_path *path, int level,
  			 struct btrfs_key *key);
  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@@ -91,8 -84,12 +84,12 @@@ void btrfs_get_block_group(struct btrfs
  
  void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  {
- 	if (atomic_dec_and_test(&cache->count))
+ 	if (atomic_dec_and_test(&cache->count)) {
+ 		WARN_ON(cache->pinned > 0);
+ 		WARN_ON(cache->reserved > 0);
+ 		WARN_ON(cache->reserved_pinned > 0);
  		kfree(cache);
+ 	}
  }
  
  /*
@@@ -319,7 -316,7 +316,7 @@@ static int caching_kthread(void *data
  
  	exclude_super_stripes(extent_root, block_group);
  	spin_lock(&block_group->space_info->lock);
- 	block_group->space_info->bytes_super += block_group->bytes_super;
+ 	block_group->space_info->bytes_readonly += block_group->bytes_super;
  	spin_unlock(&block_group->space_info->lock);
  
  	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@@ -507,6 -504,9 +504,9 @@@ static struct btrfs_space_info *__find_
  	struct list_head *head = &info->space_info;
  	struct btrfs_space_info *found;
  
+ 	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+ 		 BTRFS_BLOCK_GROUP_METADATA;
+ 
  	rcu_read_lock();
  	list_for_each_entry_rcu(found, head, list) {
  		if (found->flags == flags) {
@@@ -609,6 -609,113 +609,113 @@@ int btrfs_lookup_extent(struct btrfs_ro
  	return ret;
  }
  
+ /*
+  * helper function to lookup reference count and flags of extent.
+  *
+  * the head node for delayed ref is used to store the sum of all the
+  * reference count modifications queued up in the rbtree. the head
+  * node may also store the extent flags to set. This way you can check
+  * to see what the reference count and extent flags will be once all of
+  * the queued delayed refs have been applied.
+  */
+ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ 			     struct btrfs_root *root, u64 bytenr,
+ 			     u64 num_bytes, u64 *refs, u64 *flags)
+ {
+ 	struct btrfs_delayed_ref_head *head;
+ 	struct btrfs_delayed_ref_root *delayed_refs;
+ 	struct btrfs_path *path;
+ 	struct btrfs_extent_item *ei;
+ 	struct extent_buffer *leaf;
+ 	struct btrfs_key key;
+ 	u32 item_size;
+ 	u64 num_refs;
+ 	u64 extent_flags;
+ 	int ret;
+ 
+ 	path = btrfs_alloc_path();
+ 	if (!path)
+ 		return -ENOMEM;
+ 
+ 	key.objectid = bytenr;
+ 	key.type = BTRFS_EXTENT_ITEM_KEY;
+ 	key.offset = num_bytes;
+ 	if (!trans) {
+ 		path->skip_locking = 1;
+ 		path->search_commit_root = 1;
+ 	}
+ again:
+ 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ 				&key, path, 0, 0);
+ 	if (ret < 0)
+ 		goto out_free;
+ 
+ 	if (ret == 0) {
+ 		leaf = path->nodes[0];
+ 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ 		if (item_size >= sizeof(*ei)) {
+ 			ei = btrfs_item_ptr(leaf, path->slots[0],
+ 					    struct btrfs_extent_item);
+ 			num_refs = btrfs_extent_refs(leaf, ei);
+ 			extent_flags = btrfs_extent_flags(leaf, ei);
+ 		} else {
+ #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ 			struct btrfs_extent_item_v0 *ei0;
+ 			BUG_ON(item_size != sizeof(*ei0));
+ 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ 					     struct btrfs_extent_item_v0);
+ 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ 			/* FIXME: this isn't correct for data */
+ 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ #else
+ 			BUG();
+ #endif
+ 		}
+ 		BUG_ON(num_refs == 0);
+ 	} else {
+ 		num_refs = 0;
+ 		extent_flags = 0;
+ 		ret = 0;
+ 	}
+ 
+ 	if (!trans)
+ 		goto out;
+ 
+ 	delayed_refs = &trans->transaction->delayed_refs;
+ 	spin_lock(&delayed_refs->lock);
+ 	head = btrfs_find_delayed_ref_head(trans, bytenr);
+ 	if (head) {
+ 		if (!mutex_trylock(&head->mutex)) {
+ 			atomic_inc(&head->node.refs);
+ 			spin_unlock(&delayed_refs->lock);
+ 
+ 			btrfs_release_path(root->fs_info->extent_root, path);
+ 
+ 			mutex_lock(&head->mutex);
+ 			mutex_unlock(&head->mutex);
+ 			btrfs_put_delayed_ref(&head->node);
+ 			goto again;
+ 		}
+ 		if (head->extent_op && head->extent_op->update_flags)
+ 			extent_flags |= head->extent_op->flags_to_set;
+ 		else
+ 			BUG_ON(num_refs == 0);
+ 
+ 		num_refs += head->node.ref_mod;
+ 		mutex_unlock(&head->mutex);
+ 	}
+ 	spin_unlock(&delayed_refs->lock);
+ out:
+ 	WARN_ON(num_refs == 0);
+ 	if (refs)
+ 		*refs = num_refs;
+ 	if (flags)
+ 		*flags = extent_flags;
+ out_free:
+ 	btrfs_free_path(path);
+ 	return ret;
+ }
+ 
  /*
   * Back reference rules.  Back refs have three main goals:
   *
@@@ -1589,7 -1696,7 +1696,7 @@@ static void btrfs_issue_discard(struct 
  				u64 start, u64 len)
  {
  	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
 -			     DISCARD_FL_BARRIER);
 +			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
  }
  
  static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@@ -1871,7 -1978,6 +1978,6 @@@ static int run_delayed_tree_ref(struct 
  	return ret;
  }
  
- 
  /* helper function to actually process a single delayed ref entry */
  static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
  			       struct btrfs_root *root,
@@@ -1891,32 -1997,14 +1997,14 @@@
  		BUG_ON(extent_op);
  		head = btrfs_delayed_node_to_head(node);
  		if (insert_reserved) {
- 			int mark_free = 0;
- 			struct extent_buffer *must_clean = NULL;
- 
- 			ret = pin_down_bytes(trans, root, NULL,
- 					     node->bytenr, node->num_bytes,
- 					     head->is_data, 1, &must_clean);
- 			if (ret > 0)
- 				mark_free = 1;
- 
- 			if (must_clean) {
- 				clean_tree_block(NULL, root, must_clean);
- 				btrfs_tree_unlock(must_clean);
- 				free_extent_buffer(must_clean);
- 			}
+ 			btrfs_pin_extent(root, node->bytenr,
+ 					 node->num_bytes, 1);
  			if (head->is_data) {
  				ret = btrfs_del_csums(trans, root,
  						      node->bytenr,
  						      node->num_bytes);
  				BUG_ON(ret);
  			}
- 			if (mark_free) {
- 				ret = btrfs_free_reserved_extent(root,
- 							node->bytenr,
- 							node->num_bytes);
- 				BUG_ON(ret);
- 			}
  		}
  		mutex_unlock(&head->mutex);
  		return 0;
@@@ -2347,6 -2435,8 +2435,8 @@@ int btrfs_cross_ref_exist(struct btrfs_
  		ret = 0;
  out:
  	btrfs_free_path(path);
+ 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ 		WARN_ON(ret > 0);
  	return ret;
  }
  
@@@ -2660,12 -2750,21 +2750,21 @@@ static int update_space_info(struct btr
  			     struct btrfs_space_info **space_info)
  {
  	struct btrfs_space_info *found;
+ 	int i;
+ 	int factor;
+ 
+ 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ 		     BTRFS_BLOCK_GROUP_RAID10))
+ 		factor = 2;
+ 	else
+ 		factor = 1;
  
  	found = __find_space_info(info, flags);
  	if (found) {
  		spin_lock(&found->lock);
  		found->total_bytes += total_bytes;
  		found->bytes_used += bytes_used;
+ 		found->disk_used += bytes_used * factor;
  		found->full = 0;
  		spin_unlock(&found->lock);
  		*space_info = found;
@@@ -2675,18 -2774,20 +2774,20 @@@
  	if (!found)
  		return -ENOMEM;
  
- 	INIT_LIST_HEAD(&found->block_groups);
+ 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ 		INIT_LIST_HEAD(&found->block_groups[i]);
  	init_rwsem(&found->groups_sem);
- 	init_waitqueue_head(&found->flush_wait);
- 	init_waitqueue_head(&found->allocate_wait);
  	spin_lock_init(&found->lock);
- 	found->flags = flags;
+ 	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+ 				BTRFS_BLOCK_GROUP_SYSTEM |
+ 				BTRFS_BLOCK_GROUP_METADATA);
  	found->total_bytes = total_bytes;
  	found->bytes_used = bytes_used;
+ 	found->disk_used = bytes_used * factor;
  	found->bytes_pinned = 0;
  	found->bytes_reserved = 0;
  	found->bytes_readonly = 0;
- 	found->bytes_delalloc = 0;
+ 	found->bytes_may_use = 0;
  	found->full = 0;
  	found->force_alloc = 0;
  	*space_info = found;
@@@ -2711,19 -2812,6 +2812,6 @@@ static void set_avail_alloc_bits(struc
  	}
  }
  
- static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
- {
- 	spin_lock(&cache->space_info->lock);
- 	spin_lock(&cache->lock);
- 	if (!cache->ro) {
- 		cache->space_info->bytes_readonly += cache->key.offset -
- 					btrfs_block_group_used(&cache->item);
- 		cache->ro = 1;
- 	}
- 	spin_unlock(&cache->lock);
- 	spin_unlock(&cache->space_info->lock);
- }
- 
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
  {
  	u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@@ -2752,722 -2840,946 +2840,946 @@@
  	return flags;
  }
  
- static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
- {
- 	struct btrfs_fs_info *info = root->fs_info;
- 	u64 alloc_profile;
- 
- 	if (data) {
- 		alloc_profile = info->avail_data_alloc_bits &
- 			info->data_alloc_profile;
- 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
- 	} else if (root == root->fs_info->chunk_root) {
- 		alloc_profile = info->avail_system_alloc_bits &
- 			info->system_alloc_profile;
- 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
- 	} else {
- 		alloc_profile = info->avail_metadata_alloc_bits &
- 			info->metadata_alloc_profile;
- 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
- 	}
- 
- 	return btrfs_reduce_alloc_profile(root, data);
- }
- 
- void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
  {
- 	u64 alloc_target;
- 
- 	alloc_target = btrfs_get_alloc_profile(root, 1);
- 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- 						       alloc_target);
+ 	if (flags & BTRFS_BLOCK_GROUP_DATA)
+ 		flags |= root->fs_info->avail_data_alloc_bits &
+ 			 root->fs_info->data_alloc_profile;
+ 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ 		flags |= root->fs_info->avail_system_alloc_bits &
+ 			 root->fs_info->system_alloc_profile;
+ 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ 		flags |= root->fs_info->avail_metadata_alloc_bits &
+ 			 root->fs_info->metadata_alloc_profile;
+ 	return btrfs_reduce_alloc_profile(root, flags);
  }
  
- static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+ static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
  {
- 	u64 num_bytes;
- 	int level;
- 
- 	level = BTRFS_MAX_LEVEL - 2;
- 	/*
- 	 * NOTE: these calculations are absolutely the worst possible case.
- 	 * This assumes that _every_ item we insert will require a new leaf, and
- 	 * that the tree has grown to its maximum level size.
- 	 */
+ 	u64 flags;
  
- 	/*
- 	 * for every item we insert we could insert both an extent item and a
- 	 * extent ref item.  Then for ever item we insert, we will need to cow
- 	 * both the original leaf, plus the leaf to the left and right of it.
- 	 *
- 	 * Unless we are talking about the extent root, then we just want the
- 	 * number of items * 2, since we just need the extent item plus its ref.
- 	 */
- 	if (root == root->fs_info->extent_root)
- 		num_bytes = num_items * 2;
+ 	if (data)
+ 		flags = BTRFS_BLOCK_GROUP_DATA;
+ 	else if (root == root->fs_info->chunk_root)
+ 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
  	else
- 		num_bytes = (num_items + (2 * num_items)) * 3;
+ 		flags = BTRFS_BLOCK_GROUP_METADATA;
  
- 	/*
- 	 * num_bytes is total number of leaves we could need times the leaf
- 	 * size, and then for every leaf we could end up cow'ing 2 nodes per
- 	 * level, down to the leaf level.
- 	 */
- 	num_bytes = (num_bytes * root->leafsize) +
- 		(num_bytes * (level * 2)) * root->nodesize;
+ 	return get_alloc_profile(root, flags);
+ }
  
- 	return num_bytes;
+ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+ {
+ 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+ 						       BTRFS_BLOCK_GROUP_DATA);
  }
  
  /*
-  * Unreserve metadata space for delalloc.  If we have less reserved credits than
-  * we have extents, this function does nothing.
+  * This will check the space that the inode allocates from to make sure we have
+  * enough space for bytes.
   */
- int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- 					  struct inode *inode, int num_items)
+ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
  {
- 	struct btrfs_fs_info *info = root->fs_info;
- 	struct btrfs_space_info *meta_sinfo;
- 	u64 num_bytes;
- 	u64 alloc_target;
- 	bool bug = false;
+ 	struct btrfs_space_info *data_sinfo;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	u64 used;
+ 	int ret = 0, committed = 0;
  
- 	/* get the space info for where the metadata will live */
- 	alloc_target = btrfs_get_alloc_profile(root, 0);
- 	meta_sinfo = __find_space_info(info, alloc_target);
+ 	/* make sure bytes are sectorsize aligned */
+ 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
- 	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- 					   num_items);
+ 	data_sinfo = BTRFS_I(inode)->space_info;
+ 	if (!data_sinfo)
+ 		goto alloc;
  
- 	spin_lock(&meta_sinfo->lock);
- 	spin_lock(&BTRFS_I(inode)->accounting_lock);
- 	if (BTRFS_I(inode)->reserved_extents <=
- 	    BTRFS_I(inode)->outstanding_extents) {
- 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
- 		spin_unlock(&meta_sinfo->lock);
- 		return 0;
- 	}
- 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ again:
+ 	/* make sure we have enough space to handle the data first */
+ 	spin_lock(&data_sinfo->lock);
+ 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+ 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+ 		data_sinfo->bytes_may_use;
+ 
+ 	if (used + bytes > data_sinfo->total_bytes) {
+ 		struct btrfs_trans_handle *trans;
  
- 	BTRFS_I(inode)->reserved_extents -= num_items;
- 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+ 		/*
+ 		 * if we don't have enough free bytes in this space then we need
+ 		 * to alloc a new chunk.
+ 		 */
+ 		if (!data_sinfo->full) {
+ 			u64 alloc_target;
  
- 	if (meta_sinfo->bytes_delalloc < num_bytes) {
- 		bug = true;
- 		meta_sinfo->bytes_delalloc = 0;
- 	} else {
- 		meta_sinfo->bytes_delalloc -= num_bytes;
- 	}
- 	spin_unlock(&meta_sinfo->lock);
+ 			data_sinfo->force_alloc = 1;
+ 			spin_unlock(&data_sinfo->lock);
+ alloc:
+ 			alloc_target = btrfs_get_alloc_profile(root, 1);
+ 			trans = btrfs_join_transaction(root, 1);
+ 			if (IS_ERR(trans))
+ 				return PTR_ERR(trans);
  
- 	BUG_ON(bug);
+ 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 					     bytes + 2 * 1024 * 1024,
+ 					     alloc_target, 0);
+ 			btrfs_end_transaction(trans, root);
+ 			if (ret < 0)
+ 				return ret;
  
- 	return 0;
- }
+ 			if (!data_sinfo) {
+ 				btrfs_set_inode_space_info(root, inode);
+ 				data_sinfo = BTRFS_I(inode)->space_info;
+ 			}
+ 			goto again;
+ 		}
+ 		spin_unlock(&data_sinfo->lock);
  
- static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
- {
- 	u64 thresh;
+ 		/* commit the current transaction and try again */
+ 		if (!committed && !root->fs_info->open_ioctl_trans) {
+ 			committed = 1;
+ 			trans = btrfs_join_transaction(root, 1);
+ 			if (IS_ERR(trans))
+ 				return PTR_ERR(trans);
+ 			ret = btrfs_commit_transaction(trans, root);
+ 			if (ret)
+ 				return ret;
+ 			goto again;
+ 		}
  
- 	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- 		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- 		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- 		meta_sinfo->bytes_may_use;
+ #if 0 /* I hope we never need this code again, just in case */
+ 		printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+ 		       "%llu bytes_reserved, " "%llu bytes_pinned, "
+ 		       "%llu bytes_readonly, %llu may use %llu total\n",
+ 		       (unsigned long long)bytes,
+ 		       (unsigned long long)data_sinfo->bytes_used,
+ 		       (unsigned long long)data_sinfo->bytes_reserved,
+ 		       (unsigned long long)data_sinfo->bytes_pinned,
+ 		       (unsigned long long)data_sinfo->bytes_readonly,
+ 		       (unsigned long long)data_sinfo->bytes_may_use,
+ 		       (unsigned long long)data_sinfo->total_bytes);
+ #endif
+ 		return -ENOSPC;
+ 	}
+ 	data_sinfo->bytes_may_use += bytes;
+ 	BTRFS_I(inode)->reserved_bytes += bytes;
+ 	spin_unlock(&data_sinfo->lock);
  
- 	thresh = meta_sinfo->total_bytes - thresh;
- 	thresh *= 80;
- 	do_div(thresh, 100);
- 	if (thresh <= meta_sinfo->bytes_delalloc)
- 		meta_sinfo->force_delalloc = 1;
- 	else
- 		meta_sinfo->force_delalloc = 0;
+ 	return 0;
  }
  
- struct async_flush {
- 	struct btrfs_root *root;
- 	struct btrfs_space_info *info;
- 	struct btrfs_work work;
- };
- 
- static noinline void flush_delalloc_async(struct btrfs_work *work)
+ /*
+  * called when we are clearing a delalloc extent from the
+  * inode's io_tree or there was an error for whatever reason
+  * after calling btrfs_check_data_free_space
+  */
+ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
  {
- 	struct async_flush *async;
- 	struct btrfs_root *root;
- 	struct btrfs_space_info *info;
- 
- 	async = container_of(work, struct async_flush, work);
- 	root = async->root;
- 	info = async->info;
- 
- 	btrfs_start_delalloc_inodes(root, 0);
- 	wake_up(&info->flush_wait);
- 	btrfs_wait_ordered_extents(root, 0, 0);
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_space_info *data_sinfo;
  
- 	spin_lock(&info->lock);
- 	info->flushing = 0;
- 	spin_unlock(&info->lock);
- 	wake_up(&info->flush_wait);
+ 	/* make sure bytes are sectorsize aligned */
+ 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
  
- 	kfree(async);
+ 	data_sinfo = BTRFS_I(inode)->space_info;
+ 	spin_lock(&data_sinfo->lock);
+ 	data_sinfo->bytes_may_use -= bytes;
+ 	BTRFS_I(inode)->reserved_bytes -= bytes;
+ 	spin_unlock(&data_sinfo->lock);
  }
  
- static void wait_on_flush(struct btrfs_space_info *info)
+ static void force_metadata_allocation(struct btrfs_fs_info *info)
  {
- 	DEFINE_WAIT(wait);
- 	u64 used;
- 
- 	while (1) {
- 		prepare_to_wait(&info->flush_wait, &wait,
- 				TASK_UNINTERRUPTIBLE);
- 		spin_lock(&info->lock);
- 		if (!info->flushing) {
- 			spin_unlock(&info->lock);
- 			break;
- 		}
+ 	struct list_head *head = &info->space_info;
+ 	struct btrfs_space_info *found;
  
- 		used = info->bytes_used + info->bytes_reserved +
- 			info->bytes_pinned + info->bytes_readonly +
- 			info->bytes_super + info->bytes_root +
- 			info->bytes_may_use + info->bytes_delalloc;
- 		if (used < info->total_bytes) {
- 			spin_unlock(&info->lock);
- 			break;
- 		}
- 		spin_unlock(&info->lock);
- 		schedule();
+ 	rcu_read_lock();
+ 	list_for_each_entry_rcu(found, head, list) {
+ 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ 			found->force_alloc = 1;
  	}
- 	finish_wait(&info->flush_wait, &wait);
+ 	rcu_read_unlock();
  }
  
- static void flush_delalloc(struct btrfs_root *root,
- 				 struct btrfs_space_info *info)
+ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+ 			      u64 alloc_bytes)
  {
- 	struct async_flush *async;
- 	bool wait = false;
- 
- 	spin_lock(&info->lock);
- 
- 	if (!info->flushing)
- 		info->flushing = 1;
- 	else
- 		wait = true;
- 
- 	spin_unlock(&info->lock);
- 
- 	if (wait) {
- 		wait_on_flush(info);
- 		return;
- 	}
- 
- 	async = kzalloc(sizeof(*async), GFP_NOFS);
- 	if (!async)
- 		goto flush;
- 
- 	async->root = root;
- 	async->info = info;
- 	async->work.func = flush_delalloc_async;
+ 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
  
- 	btrfs_queue_worker(&root->fs_info->enospc_workers,
- 			   &async->work);
- 	wait_on_flush(info);
- 	return;
+ 	if (sinfo->bytes_used + sinfo->bytes_reserved +
+ 	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+ 		return 0;
  
- flush:
- 	btrfs_start_delalloc_inodes(root, 0);
- 	btrfs_wait_ordered_extents(root, 0, 0);
+ 	if (sinfo->bytes_used + sinfo->bytes_reserved +
+ 	    alloc_bytes < div_factor(num_bytes, 8))
+ 		return 0;
  
- 	spin_lock(&info->lock);
- 	info->flushing = 0;
- 	spin_unlock(&info->lock);
- 	wake_up(&info->flush_wait);
+ 	return 1;
  }
  
- static int maybe_allocate_chunk(struct btrfs_root *root,
- 				 struct btrfs_space_info *info)
+ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ 			  struct btrfs_root *extent_root, u64 alloc_bytes,
+ 			  u64 flags, int force)
  {
- 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
- 	struct btrfs_trans_handle *trans;
- 	bool wait = false;
+ 	struct btrfs_space_info *space_info;
+ 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
  	int ret = 0;
- 	u64 min_metadata;
- 	u64 free_space;
  
- 	free_space = btrfs_super_total_bytes(disk_super);
- 	/*
- 	 * we allow the metadata to grow to a max of either 10gb or 5% of the
- 	 * space in the volume.
- 	 */
- 	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
- 			     div64_u64(free_space * 5, 100));
- 	if (info->total_bytes >= min_metadata) {
- 		spin_unlock(&info->lock);
- 		return 0;
- 	}
+ 	mutex_lock(&fs_info->chunk_mutex);
  
- 	if (info->full) {
- 		spin_unlock(&info->lock);
- 		return 0;
+ 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ 
+ 	space_info = __find_space_info(extent_root->fs_info, flags);
+ 	if (!space_info) {
+ 		ret = update_space_info(extent_root->fs_info, flags,
+ 					0, 0, &space_info);
+ 		BUG_ON(ret);
  	}
+ 	BUG_ON(!space_info);
  
- 	if (!info->allocating_chunk) {
- 		info->force_alloc = 1;
- 		info->allocating_chunk = 1;
- 	} else {
- 		wait = true;
+ 	spin_lock(&space_info->lock);
+ 	if (space_info->force_alloc)
+ 		force = 1;
+ 	if (space_info->full) {
+ 		spin_unlock(&space_info->lock);
+ 		goto out;
  	}
  
- 	spin_unlock(&info->lock);
- 
- 	if (wait) {
- 		wait_event(info->allocate_wait,
- 			   !info->allocating_chunk);
- 		return 1;
+ 	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+ 		spin_unlock(&space_info->lock);
+ 		goto out;
  	}
+ 	spin_unlock(&space_info->lock);
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans) {
- 		ret = -ENOMEM;
- 		goto out;
+ 	/*
+ 	 * if we're doing a data chunk, go ahead and make sure that
+ 	 * we keep a reasonable number of metadata chunks allocated in the
+ 	 * FS as well.
+ 	 */
+ 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ 		fs_info->data_chunk_allocations++;
+ 		if (!(fs_info->data_chunk_allocations %
+ 		      fs_info->metadata_ratio))
+ 			force_metadata_allocation(fs_info);
  	}
  
- 	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 			     4096 + 2 * 1024 * 1024,
- 			     info->flags, 0);
- 	btrfs_end_transaction(trans, root);
+ 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ 	spin_lock(&space_info->lock);
  	if (ret)
- 		goto out;
+ 		space_info->full = 1;
+ 	else
+ 		ret = 1;
+ 	space_info->force_alloc = 0;
+ 	spin_unlock(&space_info->lock);
  out:
- 	spin_lock(&info->lock);
- 	info->allocating_chunk = 0;
- 	spin_unlock(&info->lock);
- 	wake_up(&info->allocate_wait);
+ 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ 	return ret;
+ }
  
- 	if (ret)
+ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+ 				struct btrfs_root *root,
+ 				struct btrfs_space_info *sinfo, u64 num_bytes)
+ {
+ 	int ret;
+ 	int end_trans = 0;
+ 
+ 	if (sinfo->full)
  		return 0;
- 	return 1;
+ 
+ 	spin_lock(&sinfo->lock);
+ 	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+ 	spin_unlock(&sinfo->lock);
+ 	if (!ret)
+ 		return 0;
+ 
+ 	if (!trans) {
+ 		trans = btrfs_join_transaction(root, 1);
+ 		BUG_ON(IS_ERR(trans));
+ 		end_trans = 1;
+ 	}
+ 
+ 	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 			     num_bytes + 2 * 1024 * 1024,
+ 			     get_alloc_profile(root, sinfo->flags), 0);
+ 
+ 	if (end_trans)
+ 		btrfs_end_transaction(trans, root);
+ 
+ 	return ret == 1 ? 1 : 0;
  }
  
  /*
-  * Reserve metadata space for delalloc.
+  * shrink metadata reservation for delalloc
   */
- int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- 					struct inode *inode, int num_items)
+ static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ 			   struct btrfs_root *root, u64 to_reclaim)
+ {
+ 	struct btrfs_block_rsv *block_rsv;
+ 	u64 reserved;
+ 	u64 max_reclaim;
+ 	u64 reclaimed = 0;
+ 	int pause = 1;
+ 	int ret;
+ 
+ 	block_rsv = &root->fs_info->delalloc_block_rsv;
+ 	spin_lock(&block_rsv->lock);
+ 	reserved = block_rsv->reserved;
+ 	spin_unlock(&block_rsv->lock);
+ 
+ 	if (reserved == 0)
+ 		return 0;
+ 
+ 	max_reclaim = min(reserved, to_reclaim);
+ 
+ 	while (1) {
+ 		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+ 		if (!ret) {
+ 			__set_current_state(TASK_INTERRUPTIBLE);
+ 			schedule_timeout(pause);
+ 			pause <<= 1;
+ 			if (pause > HZ / 10)
+ 				pause = HZ / 10;
+ 		} else {
+ 			pause = 1;
+ 		}
+ 
+ 		spin_lock(&block_rsv->lock);
+ 		if (reserved > block_rsv->reserved)
+ 			reclaimed = reserved - block_rsv->reserved;
+ 		reserved = block_rsv->reserved;
+ 		spin_unlock(&block_rsv->lock);
+ 
+ 		if (reserved == 0 || reclaimed >= max_reclaim)
+ 			break;
+ 
+ 		if (trans && trans->transaction->blocked)
+ 			return -EAGAIN;
+ 	}
+ 	return reclaimed >= to_reclaim;
+ }
+ 
+ static int should_retry_reserve(struct btrfs_trans_handle *trans,
+ 				struct btrfs_root *root,
+ 				struct btrfs_block_rsv *block_rsv,
+ 				u64 num_bytes, int *retries)
  {
- 	struct btrfs_fs_info *info = root->fs_info;
- 	struct btrfs_space_info *meta_sinfo;
- 	u64 num_bytes;
- 	u64 used;
- 	u64 alloc_target;
- 	int flushed = 0;
- 	int force_delalloc;
+ 	struct btrfs_space_info *space_info = block_rsv->space_info;
+ 	int ret;
  
- 	/* get the space info for where the metadata will live */
- 	alloc_target = btrfs_get_alloc_profile(root, 0);
- 	meta_sinfo = __find_space_info(info, alloc_target);
+ 	if ((*retries) > 2)
+ 		return -ENOSPC;
  
- 	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- 					   num_items);
- again:
- 	spin_lock(&meta_sinfo->lock);
+ 	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+ 	if (ret)
+ 		return 1;
  
- 	force_delalloc = meta_sinfo->force_delalloc;
+ 	if (trans && trans->transaction->in_commit)
+ 		return -ENOSPC;
  
- 	if (unlikely(!meta_sinfo->bytes_root))
- 		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+ 	ret = shrink_delalloc(trans, root, num_bytes);
+ 	if (ret)
+ 		return ret;
  
- 	if (!flushed)
- 		meta_sinfo->bytes_delalloc += num_bytes;
+ 	spin_lock(&space_info->lock);
+ 	if (space_info->bytes_pinned < num_bytes)
+ 		ret = 1;
+ 	spin_unlock(&space_info->lock);
+ 	if (ret)
+ 		return -ENOSPC;
  
- 	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- 		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- 		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- 		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+ 	(*retries)++;
  
- 	if (used > meta_sinfo->total_bytes) {
- 		flushed++;
+ 	if (trans)
+ 		return -EAGAIN;
  
- 		if (flushed == 1) {
- 			if (maybe_allocate_chunk(root, meta_sinfo))
- 				goto again;
- 			flushed++;
+ 	trans = btrfs_join_transaction(root, 1);
+ 	BUG_ON(IS_ERR(trans));
+ 	ret = btrfs_commit_transaction(trans, root);
+ 	BUG_ON(ret);
+ 
+ 	return 1;
+ }
+ 
+ static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+ 				  u64 num_bytes)
+ {
+ 	struct btrfs_space_info *space_info = block_rsv->space_info;
+ 	u64 unused;
+ 	int ret = -ENOSPC;
+ 
+ 	spin_lock(&space_info->lock);
+ 	unused = space_info->bytes_used + space_info->bytes_reserved +
+ 		 space_info->bytes_pinned + space_info->bytes_readonly;
+ 
+ 	if (unused < space_info->total_bytes)
+ 		unused = space_info->total_bytes - unused;
+ 	else
+ 		unused = 0;
+ 
+ 	if (unused >= num_bytes) {
+ 		if (block_rsv->priority >= 10) {
+ 			space_info->bytes_reserved += num_bytes;
+ 			ret = 0;
  		} else {
- 			spin_unlock(&meta_sinfo->lock);
+ 			if ((unused + block_rsv->reserved) *
+ 			    block_rsv->priority >=
+ 			    (num_bytes + block_rsv->reserved) * 10) {
+ 				space_info->bytes_reserved += num_bytes;
+ 				ret = 0;
+ 			}
  		}
+ 	}
+ 	spin_unlock(&space_info->lock);
  
- 		if (flushed == 2) {
- 			filemap_flush(inode->i_mapping);
- 			goto again;
- 		} else if (flushed == 3) {
- 			flush_delalloc(root, meta_sinfo);
- 			goto again;
+ 	return ret;
+ }
+ 
+ static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+ 					     struct btrfs_root *root)
+ {
+ 	struct btrfs_block_rsv *block_rsv;
+ 	if (root->ref_cows)
+ 		block_rsv = trans->block_rsv;
+ 	else
+ 		block_rsv = root->block_rsv;
+ 
+ 	if (!block_rsv)
+ 		block_rsv = &root->fs_info->empty_block_rsv;
+ 
+ 	return block_rsv;
+ }
+ 
+ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ 			       u64 num_bytes)
+ {
+ 	int ret = -ENOSPC;
+ 	spin_lock(&block_rsv->lock);
+ 	if (block_rsv->reserved >= num_bytes) {
+ 		block_rsv->reserved -= num_bytes;
+ 		if (block_rsv->reserved < block_rsv->size)
+ 			block_rsv->full = 0;
+ 		ret = 0;
+ 	}
+ 	spin_unlock(&block_rsv->lock);
+ 	return ret;
+ }
+ 
+ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ 				u64 num_bytes, int update_size)
+ {
+ 	spin_lock(&block_rsv->lock);
+ 	block_rsv->reserved += num_bytes;
+ 	if (update_size)
+ 		block_rsv->size += num_bytes;
+ 	else if (block_rsv->reserved >= block_rsv->size)
+ 		block_rsv->full = 1;
+ 	spin_unlock(&block_rsv->lock);
+ }
+ 
+ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+ 			     struct btrfs_block_rsv *dest, u64 num_bytes)
+ {
+ 	struct btrfs_space_info *space_info = block_rsv->space_info;
+ 
+ 	spin_lock(&block_rsv->lock);
+ 	if (num_bytes == (u64)-1)
+ 		num_bytes = block_rsv->size;
+ 	block_rsv->size -= num_bytes;
+ 	if (block_rsv->reserved >= block_rsv->size) {
+ 		num_bytes = block_rsv->reserved - block_rsv->size;
+ 		block_rsv->reserved = block_rsv->size;
+ 		block_rsv->full = 1;
+ 	} else {
+ 		num_bytes = 0;
+ 	}
+ 	spin_unlock(&block_rsv->lock);
+ 
+ 	if (num_bytes > 0) {
+ 		if (dest) {
+ 			block_rsv_add_bytes(dest, num_bytes, 0);
+ 		} else {
+ 			spin_lock(&space_info->lock);
+ 			space_info->bytes_reserved -= num_bytes;
+ 			spin_unlock(&space_info->lock);
  		}
- 		spin_lock(&meta_sinfo->lock);
- 		meta_sinfo->bytes_delalloc -= num_bytes;
- 		spin_unlock(&meta_sinfo->lock);
- 		printk(KERN_ERR "enospc, has %d, reserved %d\n",
- 		       BTRFS_I(inode)->outstanding_extents,
- 		       BTRFS_I(inode)->reserved_extents);
- 		dump_space_info(meta_sinfo, 0, 0);
- 		return -ENOSPC;
  	}
+ }
  
- 	BTRFS_I(inode)->reserved_extents += num_items;
- 	check_force_delalloc(meta_sinfo);
- 	spin_unlock(&meta_sinfo->lock);
+ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+ 				   struct btrfs_block_rsv *dst, u64 num_bytes)
+ {
+ 	int ret;
  
- 	if (!flushed && force_delalloc)
- 		filemap_flush(inode->i_mapping);
+ 	ret = block_rsv_use_bytes(src, num_bytes);
+ 	if (ret)
+ 		return ret;
  
+ 	block_rsv_add_bytes(dst, num_bytes, 1);
  	return 0;
  }
  
- /*
-  * unreserve num_items number of items worth of metadata space.  This needs to
-  * be paired with btrfs_reserve_metadata_space.
-  *
-  * NOTE: if you have the option, run this _AFTER_ you do a
-  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
-  * oprations which will result in more used metadata, so we want to make sure we
-  * can do that without issue.
-  */
- int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
  {
- 	struct btrfs_fs_info *info = root->fs_info;
- 	struct btrfs_space_info *meta_sinfo;
- 	u64 num_bytes;
+ 	memset(rsv, 0, sizeof(*rsv));
+ 	spin_lock_init(&rsv->lock);
+ 	atomic_set(&rsv->usage, 1);
+ 	rsv->priority = 6;
+ 	INIT_LIST_HEAD(&rsv->list);
+ }
+ 
+ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+ {
+ 	struct btrfs_block_rsv *block_rsv;
+ 	struct btrfs_fs_info *fs_info = root->fs_info;
  	u64 alloc_target;
- 	bool bug = false;
  
- 	/* get the space info for where the metadata will live */
- 	alloc_target = btrfs_get_alloc_profile(root, 0);
- 	meta_sinfo = __find_space_info(info, alloc_target);
+ 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ 	if (!block_rsv)
+ 		return NULL;
  
- 	num_bytes = calculate_bytes_needed(root, num_items);
+ 	btrfs_init_block_rsv(block_rsv);
  
- 	spin_lock(&meta_sinfo->lock);
- 	if (meta_sinfo->bytes_may_use < num_bytes) {
- 		bug = true;
- 		meta_sinfo->bytes_may_use = 0;
- 	} else {
- 		meta_sinfo->bytes_may_use -= num_bytes;
- 	}
- 	spin_unlock(&meta_sinfo->lock);
+ 	alloc_target = btrfs_get_alloc_profile(root, 0);
+ 	block_rsv->space_info = __find_space_info(fs_info,
+ 						  BTRFS_BLOCK_GROUP_METADATA);
  
- 	BUG_ON(bug);
+ 	return block_rsv;
+ }
  
- 	return 0;
+ void btrfs_free_block_rsv(struct btrfs_root *root,
+ 			  struct btrfs_block_rsv *rsv)
+ {
+ 	if (rsv && atomic_dec_and_test(&rsv->usage)) {
+ 		btrfs_block_rsv_release(root, rsv, (u64)-1);
+ 		if (!rsv->durable)
+ 			kfree(rsv);
+ 	}
  }
  
  /*
-  * Reserve some metadata space for use.  We'll calculate the worste case number
-  * of bytes that would be needed to modify num_items number of items.  If we
-  * have space, fantastic, if not, you get -ENOSPC.  Please call
-  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
-  * items you reserved, since whatever metadata you needed should have already
-  * been allocated.
-  *
-  * This will commit the transaction to make more space if we don't have enough
-  * metadata space.  THe only time we don't do this is if we're reserving space
-  * inside of a transaction, then we will just return -ENOSPC and it is the
-  * callers responsibility to handle it properly.
+  * make the block_rsv struct be able to capture freed space.
+  * the captured space will be re-added to the block_rsv struct
+  * after transaction commit
   */
- int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ 				 struct btrfs_block_rsv *block_rsv)
  {
- 	struct btrfs_fs_info *info = root->fs_info;
- 	struct btrfs_space_info *meta_sinfo;
- 	u64 num_bytes;
- 	u64 used;
- 	u64 alloc_target;
- 	int retries = 0;
+ 	block_rsv->durable = 1;
+ 	mutex_lock(&fs_info->durable_block_rsv_mutex);
+ 	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+ 	mutex_unlock(&fs_info->durable_block_rsv_mutex);
+ }
  
- 	/* get the space info for where the metadata will live */
- 	alloc_target = btrfs_get_alloc_profile(root, 0);
- 	meta_sinfo = __find_space_info(info, alloc_target);
+ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ 			struct btrfs_root *root,
+ 			struct btrfs_block_rsv *block_rsv,
+ 			u64 num_bytes, int *retries)
+ {
+ 	int ret;
  
- 	num_bytes = calculate_bytes_needed(root, num_items);
+ 	if (num_bytes == 0)
+ 		return 0;
  again:
- 	spin_lock(&meta_sinfo->lock);
+ 	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ 	if (!ret) {
+ 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ 		return 0;
+ 	}
  
- 	if (unlikely(!meta_sinfo->bytes_root))
- 		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+ 	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+ 	if (ret > 0)
+ 		goto again;
+ 
+ 	return ret;
+ }
  
- 	if (!retries)
- 		meta_sinfo->bytes_may_use += num_bytes;
+ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ 			  struct btrfs_root *root,
+ 			  struct btrfs_block_rsv *block_rsv,
+ 			  u64 min_reserved, int min_factor)
+ {
+ 	u64 num_bytes = 0;
+ 	int commit_trans = 0;
+ 	int ret = -ENOSPC;
  
- 	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- 		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- 		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- 		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+ 	if (!block_rsv)
+ 		return 0;
  
- 	if (used > meta_sinfo->total_bytes) {
- 		retries++;
- 		if (retries == 1) {
- 			if (maybe_allocate_chunk(root, meta_sinfo))
- 				goto again;
- 			retries++;
- 		} else {
- 			spin_unlock(&meta_sinfo->lock);
- 		}
+ 	spin_lock(&block_rsv->lock);
+ 	if (min_factor > 0)
+ 		num_bytes = div_factor(block_rsv->size, min_factor);
+ 	if (min_reserved > num_bytes)
+ 		num_bytes = min_reserved;
  
- 		if (retries == 2) {
- 			flush_delalloc(root, meta_sinfo);
- 			goto again;
+ 	if (block_rsv->reserved >= num_bytes) {
+ 		ret = 0;
+ 	} else {
+ 		num_bytes -= block_rsv->reserved;
+ 		if (block_rsv->durable &&
+ 		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+ 			commit_trans = 1;
+ 	}
+ 	spin_unlock(&block_rsv->lock);
+ 	if (!ret)
+ 		return 0;
+ 
+ 	if (block_rsv->refill_used) {
+ 		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ 		if (!ret) {
+ 			block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ 			return 0;
  		}
- 		spin_lock(&meta_sinfo->lock);
- 		meta_sinfo->bytes_may_use -= num_bytes;
- 		spin_unlock(&meta_sinfo->lock);
+ 	}
  
- 		dump_space_info(meta_sinfo, 0, 0);
- 		return -ENOSPC;
+ 	if (commit_trans) {
+ 		if (trans)
+ 			return -EAGAIN;
+ 
+ 		trans = btrfs_join_transaction(root, 1);
+ 		BUG_ON(IS_ERR(trans));
+ 		ret = btrfs_commit_transaction(trans, root);
+ 		return 0;
  	}
  
- 	check_force_delalloc(meta_sinfo);
- 	spin_unlock(&meta_sinfo->lock);
+ 	WARN_ON(1);
+ 	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ 		block_rsv->size, block_rsv->reserved,
+ 		block_rsv->freed[0], block_rsv->freed[1]);
  
- 	return 0;
+ 	return -ENOSPC;
+ }
+ 
+ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ 			    struct btrfs_block_rsv *dst_rsv,
+ 			    u64 num_bytes)
+ {
+ 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ }
+ 
+ void btrfs_block_rsv_release(struct btrfs_root *root,
+ 			     struct btrfs_block_rsv *block_rsv,
+ 			     u64 num_bytes)
+ {
+ 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ 	if (global_rsv->full || global_rsv == block_rsv ||
+ 	    block_rsv->space_info != global_rsv->space_info)
+ 		global_rsv = NULL;
+ 	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
  }
  
  /*
-  * This will check the space that the inode allocates from to make sure we have
-  * enough space for bytes.
+  * helper to calculate size of global block reservation.
+  * the desired value is sum of space used by extent tree,
+  * checksum tree and root tree
   */
- int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- 				u64 bytes)
+ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
  {
- 	struct btrfs_space_info *data_sinfo;
- 	u64 used;
- 	int ret = 0, committed = 0, flushed = 0;
+ 	struct btrfs_space_info *sinfo;
+ 	u64 num_bytes;
+ 	u64 meta_used;
+ 	u64 data_used;
+ 	int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ #if 0
+ 	/*
+ 	 * per tree used space accounting can be inaccurate, so we
+ 	 * can't rely on it.
+ 	 */
+ 	spin_lock(&fs_info->extent_root->accounting_lock);
+ 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+ 	spin_unlock(&fs_info->extent_root->accounting_lock);
  
- 	/* make sure bytes are sectorsize aligned */
- 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ 	spin_lock(&fs_info->csum_root->accounting_lock);
+ 	num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+ 	spin_unlock(&fs_info->csum_root->accounting_lock);
  
- 	data_sinfo = BTRFS_I(inode)->space_info;
- 	if (!data_sinfo)
- 		goto alloc;
+ 	spin_lock(&fs_info->tree_root->accounting_lock);
+ 	num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+ 	spin_unlock(&fs_info->tree_root->accounting_lock);
+ #endif
+ 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ 	spin_lock(&sinfo->lock);
+ 	data_used = sinfo->bytes_used;
+ 	spin_unlock(&sinfo->lock);
  
- again:
- 	/* make sure we have enough space to handle the data first */
- 	spin_lock(&data_sinfo->lock);
- 	used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
- 		data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
- 		data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
- 		data_sinfo->bytes_super;
+ 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ 	spin_lock(&sinfo->lock);
+ 	meta_used = sinfo->bytes_used;
+ 	spin_unlock(&sinfo->lock);
  
- 	if (used + bytes > data_sinfo->total_bytes) {
- 		struct btrfs_trans_handle *trans;
+ 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+ 		    csum_size * 2;
+ 	num_bytes += div64_u64(data_used + meta_used, 50);
  
- 		if (!flushed) {
- 			spin_unlock(&data_sinfo->lock);
- 			flush_delalloc(root, data_sinfo);
- 			flushed = 1;
- 			goto again;
- 		}
+ 	if (num_bytes * 3 > meta_used)
+ 		num_bytes = div64_u64(meta_used, 3);
  
- 		/*
- 		 * if we don't have enough free bytes in this space then we need
- 		 * to alloc a new chunk.
- 		 */
- 		if (!data_sinfo->full) {
- 			u64 alloc_target;
+ 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ }
  
- 			data_sinfo->force_alloc = 1;
- 			spin_unlock(&data_sinfo->lock);
- alloc:
- 			alloc_target = btrfs_get_alloc_profile(root, 1);
- 			trans = btrfs_start_transaction(root, 1);
- 			if (!trans)
- 				return -ENOMEM;
+ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+ {
+ 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ 	struct btrfs_space_info *sinfo = block_rsv->space_info;
+ 	u64 num_bytes;
  
- 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 					     bytes + 2 * 1024 * 1024,
- 					     alloc_target, 0);
- 			btrfs_end_transaction(trans, root);
- 			if (ret)
- 				return ret;
+ 	num_bytes = calc_global_metadata_size(fs_info);
  
- 			if (!data_sinfo) {
- 				btrfs_set_inode_space_info(root, inode);
- 				data_sinfo = BTRFS_I(inode)->space_info;
- 			}
- 			goto again;
- 		}
- 		spin_unlock(&data_sinfo->lock);
+ 	spin_lock(&block_rsv->lock);
+ 	spin_lock(&sinfo->lock);
  
- 		/* commit the current transaction and try again */
- 		if (!committed && !root->fs_info->open_ioctl_trans) {
- 			committed = 1;
- 			trans = btrfs_join_transaction(root, 1);
- 			if (!trans)
- 				return -ENOMEM;
- 			ret = btrfs_commit_transaction(trans, root);
- 			if (ret)
- 				return ret;
- 			goto again;
- 		}
+ 	block_rsv->size = num_bytes;
  
- 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
- 		       ", %llu bytes_used, %llu bytes_reserved, "
- 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
- 		       "%llu total\n", (unsigned long long)bytes,
- 		       (unsigned long long)data_sinfo->bytes_delalloc,
- 		       (unsigned long long)data_sinfo->bytes_used,
- 		       (unsigned long long)data_sinfo->bytes_reserved,
- 		       (unsigned long long)data_sinfo->bytes_pinned,
- 		       (unsigned long long)data_sinfo->bytes_readonly,
- 		       (unsigned long long)data_sinfo->bytes_may_use,
- 		       (unsigned long long)data_sinfo->total_bytes);
- 		return -ENOSPC;
+ 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ 		    sinfo->bytes_reserved + sinfo->bytes_readonly;
+ 
+ 	if (sinfo->total_bytes > num_bytes) {
+ 		num_bytes = sinfo->total_bytes - num_bytes;
+ 		block_rsv->reserved += num_bytes;
+ 		sinfo->bytes_reserved += num_bytes;
  	}
- 	data_sinfo->bytes_may_use += bytes;
- 	BTRFS_I(inode)->reserved_bytes += bytes;
- 	spin_unlock(&data_sinfo->lock);
  
- 	return 0;
+ 	if (block_rsv->reserved >= block_rsv->size) {
+ 		num_bytes = block_rsv->reserved - block_rsv->size;
+ 		sinfo->bytes_reserved -= num_bytes;
+ 		block_rsv->reserved = block_rsv->size;
+ 		block_rsv->full = 1;
+ 	}
+ #if 0
+ 	printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+ 		block_rsv->size, block_rsv->reserved);
+ #endif
+ 	spin_unlock(&sinfo->lock);
+ 	spin_unlock(&block_rsv->lock);
  }
  
- /*
-  * if there was an error for whatever reason after calling
-  * btrfs_check_data_free_space, call this so we can cleanup the counters.
-  */
- void btrfs_free_reserved_data_space(struct btrfs_root *root,
- 				    struct inode *inode, u64 bytes)
+ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
  {
- 	struct btrfs_space_info *data_sinfo;
+ 	struct btrfs_space_info *space_info;
  
- 	/* make sure bytes are sectorsize aligned */
- 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ 	fs_info->chunk_block_rsv.space_info = space_info;
+ 	fs_info->chunk_block_rsv.priority = 10;
  
- 	data_sinfo = BTRFS_I(inode)->space_info;
- 	spin_lock(&data_sinfo->lock);
- 	data_sinfo->bytes_may_use -= bytes;
- 	BTRFS_I(inode)->reserved_bytes -= bytes;
- 	spin_unlock(&data_sinfo->lock);
+ 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ 	fs_info->global_block_rsv.space_info = space_info;
+ 	fs_info->global_block_rsv.priority = 10;
+ 	fs_info->global_block_rsv.refill_used = 1;
+ 	fs_info->delalloc_block_rsv.space_info = space_info;
+ 	fs_info->trans_block_rsv.space_info = space_info;
+ 	fs_info->empty_block_rsv.space_info = space_info;
+ 	fs_info->empty_block_rsv.priority = 10;
+ 
+ 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+ 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+ 
+ 	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+ 
+ 	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+ 
+ 	update_global_block_rsv(fs_info);
  }
  
- /* called when we are adding a delalloc extent to the inode's io_tree */
- void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- 				  u64 bytes)
+ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
  {
- 	struct btrfs_space_info *data_sinfo;
+ 	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+ 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+ 	WARN_ON(fs_info->trans_block_rsv.size > 0);
+ 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ }
  
- 	/* get the space info for where this inode will be storing its data */
- 	data_sinfo = BTRFS_I(inode)->space_info;
+ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+ {
+ 	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 		3 * num_items;
+ }
  
- 	/* make sure we have enough space to handle the data first */
- 	spin_lock(&data_sinfo->lock);
- 	data_sinfo->bytes_delalloc += bytes;
+ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ 				 struct btrfs_root *root,
+ 				 int num_items, int *retries)
+ {
+ 	u64 num_bytes;
+ 	int ret;
  
- 	/*
- 	 * we are adding a delalloc extent without calling
- 	 * btrfs_check_data_free_space first.  This happens on a weird
- 	 * writepage condition, but shouldn't hurt our accounting
- 	 */
- 	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
- 		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
- 		BTRFS_I(inode)->reserved_bytes = 0;
- 	} else {
- 		data_sinfo->bytes_may_use -= bytes;
- 		BTRFS_I(inode)->reserved_bytes -= bytes;
- 	}
+ 	if (num_items == 0 || root->fs_info->chunk_root == root)
+ 		return 0;
  
- 	spin_unlock(&data_sinfo->lock);
+ 	num_bytes = calc_trans_metadata_size(root, num_items);
+ 	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+ 				  num_bytes, retries);
+ 	if (!ret) {
+ 		trans->bytes_reserved += num_bytes;
+ 		trans->block_rsv = &root->fs_info->trans_block_rsv;
+ 	}
+ 	return ret;
  }
  
- /* called when we are clearing an delalloc extent from the inode's io_tree */
- void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- 			      u64 bytes)
+ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ 				  struct btrfs_root *root)
  {
- 	struct btrfs_space_info *info;
+ 	if (!trans->bytes_reserved)
+ 		return;
  
- 	info = BTRFS_I(inode)->space_info;
+ 	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+ 	btrfs_block_rsv_release(root, trans->block_rsv,
+ 				trans->bytes_reserved);
+ 	trans->bytes_reserved = 0;
+ }
  
- 	spin_lock(&info->lock);
- 	info->bytes_delalloc -= bytes;
- 	spin_unlock(&info->lock);
+ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ 				  struct inode *inode)
+ {
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+ 
+ 	/*
+ 	 * one for deleting orphan item, one for updating inode and
+ 	 * two for calling btrfs_truncate_inode_items.
+ 	 *
+ 	 * btrfs_truncate_inode_items is a delete operation, it frees
+ 	 * more space than it uses in most cases. So two units of
+ 	 * metadata space should be enough for calling it many times.
+ 	 * If all of the metadata space is used, we can commit
+ 	 * transaction and use space it freed.
+ 	 */
+ 	u64 num_bytes = calc_trans_metadata_size(root, 4);
+ 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
- static void force_metadata_allocation(struct btrfs_fs_info *info)
+ void btrfs_orphan_release_metadata(struct inode *inode)
  {
- 	struct list_head *head = &info->space_info;
- 	struct btrfs_space_info *found;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	u64 num_bytes = calc_trans_metadata_size(root, 4);
+ 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+ }
  
- 	rcu_read_lock();
- 	list_for_each_entry_rcu(found, head, list) {
- 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- 			found->force_alloc = 1;
- 	}
- 	rcu_read_unlock();
+ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ 				struct btrfs_pending_snapshot *pending)
+ {
+ 	struct btrfs_root *root = pending->root;
+ 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+ 	/*
+ 	 * two for root back/forward refs, two for directory entries
+ 	 * and one for root of the snapshot.
+ 	 */
+ 	u64 num_bytes = calc_trans_metadata_size(root, 5);
+ 	dst_rsv->space_info = src_rsv->space_info;
+ 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
- static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- 			  struct btrfs_root *extent_root, u64 alloc_bytes,
- 			  u64 flags, int force)
+ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
  {
- 	struct btrfs_space_info *space_info;
- 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
- 	u64 thresh;
- 	int ret = 0;
+ 	return num_bytes >>= 3;
+ }
  
- 	mutex_lock(&fs_info->chunk_mutex);
+ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+ {
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ 	u64 to_reserve;
+ 	int nr_extents;
+ 	int retries = 0;
+ 	int ret;
  
- 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ 	if (btrfs_transaction_in_commit(root->fs_info))
+ 		schedule_timeout(1);
  
- 	space_info = __find_space_info(extent_root->fs_info, flags);
- 	if (!space_info) {
- 		ret = update_space_info(extent_root->fs_info, flags,
- 					0, 0, &space_info);
- 		BUG_ON(ret);
+ 	num_bytes = ALIGN(num_bytes, root->sectorsize);
+ again:
+ 	spin_lock(&BTRFS_I(inode)->accounting_lock);
+ 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+ 	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+ 		nr_extents -= BTRFS_I(inode)->reserved_extents;
+ 		to_reserve = calc_trans_metadata_size(root, nr_extents);
+ 	} else {
+ 		nr_extents = 0;
+ 		to_reserve = 0;
  	}
- 	BUG_ON(!space_info);
  
- 	spin_lock(&space_info->lock);
- 	if (space_info->force_alloc)
- 		force = 1;
- 	if (space_info->full) {
- 		spin_unlock(&space_info->lock);
- 		goto out;
+ 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
+ 	ret = reserve_metadata_bytes(block_rsv, to_reserve);
+ 	if (ret) {
+ 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ 		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+ 					   &retries);
+ 		if (ret > 0)
+ 			goto again;
+ 		return ret;
  	}
  
- 	thresh = space_info->total_bytes - space_info->bytes_readonly;
- 	thresh = div_factor(thresh, 8);
- 	if (!force &&
- 	   (space_info->bytes_used + space_info->bytes_pinned +
- 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
- 		spin_unlock(&space_info->lock);
- 		goto out;
- 	}
- 	spin_unlock(&space_info->lock);
+ 	BTRFS_I(inode)->reserved_extents += nr_extents;
+ 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
  
- 	/*
- 	 * if we're doing a data chunk, go ahead and make sure that
- 	 * we keep a reasonable number of metadata chunks allocated in the
- 	 * FS as well.
- 	 */
- 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
- 		fs_info->data_chunk_allocations++;
- 		if (!(fs_info->data_chunk_allocations %
- 		      fs_info->metadata_ratio))
- 			force_metadata_allocation(fs_info);
+ 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
+ 
+ 	if (block_rsv->size > 512 * 1024 * 1024)
+ 		shrink_delalloc(NULL, root, to_reserve);
+ 
+ 	return 0;
+ }
+ 
+ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+ {
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	u64 to_free;
+ 	int nr_extents;
+ 
+ 	num_bytes = ALIGN(num_bytes, root->sectorsize);
+ 	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+ 
+ 	spin_lock(&BTRFS_I(inode)->accounting_lock);
+ 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+ 	if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+ 		nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+ 		BTRFS_I(inode)->reserved_extents -= nr_extents;
+ 	} else {
+ 		nr_extents = 0;
  	}
+ 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
  
- 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
- 	spin_lock(&space_info->lock);
+ 	to_free = calc_csum_metadata_size(inode, num_bytes);
+ 	if (nr_extents > 0)
+ 		to_free += calc_trans_metadata_size(root, nr_extents);
+ 
+ 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+ 				to_free);
+ }
+ 
+ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+ {
+ 	int ret;
+ 
+ 	ret = btrfs_check_data_free_space(inode, num_bytes);
  	if (ret)
- 		space_info->full = 1;
- 	space_info->force_alloc = 0;
- 	spin_unlock(&space_info->lock);
- out:
- 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
- 	return ret;
+ 		return ret;
+ 
+ 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+ 	if (ret) {
+ 		btrfs_free_reserved_data_space(inode, num_bytes);
+ 		return ret;
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+ {
+ 	btrfs_delalloc_release_metadata(inode, num_bytes);
+ 	btrfs_free_reserved_data_space(inode, num_bytes);
  }
  
  static int update_block_group(struct btrfs_trans_handle *trans,
  			      struct btrfs_root *root,
- 			      u64 bytenr, u64 num_bytes, int alloc,
- 			      int mark_free)
+ 			      u64 bytenr, u64 num_bytes, int alloc)
  {
  	struct btrfs_block_group_cache *cache;
  	struct btrfs_fs_info *info = root->fs_info;
+ 	int factor;
  	u64 total = num_bytes;
  	u64 old_val;
  	u64 byte_in_group;
@@@ -3486,6 -3798,12 +3798,12 @@@
  		cache = btrfs_lookup_block_group(info, bytenr);
  		if (!cache)
  			return -1;
+ 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+ 				    BTRFS_BLOCK_GROUP_RAID1 |
+ 				    BTRFS_BLOCK_GROUP_RAID10))
+ 			factor = 2;
+ 		else
+ 			factor = 1;
  		byte_in_group = bytenr - cache->key.objectid;
  		WARN_ON(byte_in_group > cache->key.offset);
  
@@@ -3498,31 -3816,24 +3816,24 @@@
  			old_val += num_bytes;
  			btrfs_set_block_group_used(&cache->item, old_val);
  			cache->reserved -= num_bytes;
- 			cache->space_info->bytes_used += num_bytes;
  			cache->space_info->bytes_reserved -= num_bytes;
- 			if (cache->ro)
- 				cache->space_info->bytes_readonly -= num_bytes;
+ 			cache->space_info->bytes_used += num_bytes;
+ 			cache->space_info->disk_used += num_bytes * factor;
  			spin_unlock(&cache->lock);
  			spin_unlock(&cache->space_info->lock);
  		} else {
  			old_val -= num_bytes;
- 			cache->space_info->bytes_used -= num_bytes;
- 			if (cache->ro)
- 				cache->space_info->bytes_readonly += num_bytes;
  			btrfs_set_block_group_used(&cache->item, old_val);
+ 			cache->pinned += num_bytes;
+ 			cache->space_info->bytes_pinned += num_bytes;
+ 			cache->space_info->bytes_used -= num_bytes;
+ 			cache->space_info->disk_used -= num_bytes * factor;
  			spin_unlock(&cache->lock);
  			spin_unlock(&cache->space_info->lock);
- 			if (mark_free) {
- 				int ret;
  
- 				ret = btrfs_discard_extent(root, bytenr,
- 							   num_bytes);
- 				WARN_ON(ret);
- 
- 				ret = btrfs_add_free_space(cache, bytenr,
- 							   num_bytes);
- 				WARN_ON(ret);
- 			}
+ 			set_extent_dirty(info->pinned_extents,
+ 					 bytenr, bytenr + num_bytes - 1,
+ 					 GFP_NOFS | __GFP_NOFAIL);
  		}
  		btrfs_put_block_group(cache);
  		total -= num_bytes;
@@@ -3546,18 -3857,10 +3857,10 @@@ static u64 first_logical_byte(struct bt
  	return bytenr;
  }
  
- /*
-  * this function must be called within transaction
-  */
- int btrfs_pin_extent(struct btrfs_root *root,
- 		     u64 bytenr, u64 num_bytes, int reserved)
+ static int pin_down_extent(struct btrfs_root *root,
+ 			   struct btrfs_block_group_cache *cache,
+ 			   u64 bytenr, u64 num_bytes, int reserved)
  {
- 	struct btrfs_fs_info *fs_info = root->fs_info;
- 	struct btrfs_block_group_cache *cache;
- 
- 	cache = btrfs_lookup_block_group(fs_info, bytenr);
- 	BUG_ON(!cache);
- 
  	spin_lock(&cache->space_info->lock);
  	spin_lock(&cache->lock);
  	cache->pinned += num_bytes;
@@@ -3569,28 -3872,68 +3872,68 @@@
  	spin_unlock(&cache->lock);
  	spin_unlock(&cache->space_info->lock);
  
- 	btrfs_put_block_group(cache);
+ 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ 	return 0;
+ }
+ 
+ /*
+  * this function must be called within transaction
+  */
+ int btrfs_pin_extent(struct btrfs_root *root,
+ 		     u64 bytenr, u64 num_bytes, int reserved)
+ {
+ 	struct btrfs_block_group_cache *cache;
  
- 	set_extent_dirty(fs_info->pinned_extents,
- 			 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ 	BUG_ON(!cache);
+ 
+ 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+ 
+ 	btrfs_put_block_group(cache);
  	return 0;
  }
  
- static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- 				   u64 num_bytes, int reserve)
+ /*
+  * update size of reserved extents. this function may return -EAGAIN
+  * if 'reserve' is true or 'sinfo' is false.
+  */
+ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ 				 u64 num_bytes, int reserve, int sinfo)
  {
- 	spin_lock(&cache->space_info->lock);
- 	spin_lock(&cache->lock);
- 	if (reserve) {
- 		cache->reserved += num_bytes;
- 		cache->space_info->bytes_reserved += num_bytes;
+ 	int ret = 0;
+ 	if (sinfo) {
+ 		struct btrfs_space_info *space_info = cache->space_info;
+ 		spin_lock(&space_info->lock);
+ 		spin_lock(&cache->lock);
+ 		if (reserve) {
+ 			if (cache->ro) {
+ 				ret = -EAGAIN;
+ 			} else {
+ 				cache->reserved += num_bytes;
+ 				space_info->bytes_reserved += num_bytes;
+ 			}
+ 		} else {
+ 			if (cache->ro)
+ 				space_info->bytes_readonly += num_bytes;
+ 			cache->reserved -= num_bytes;
+ 			space_info->bytes_reserved -= num_bytes;
+ 		}
+ 		spin_unlock(&cache->lock);
+ 		spin_unlock(&space_info->lock);
  	} else {
- 		cache->reserved -= num_bytes;
- 		cache->space_info->bytes_reserved -= num_bytes;
+ 		spin_lock(&cache->lock);
+ 		if (cache->ro) {
+ 			ret = -EAGAIN;
+ 		} else {
+ 			if (reserve)
+ 				cache->reserved += num_bytes;
+ 			else
+ 				cache->reserved -= num_bytes;
+ 		}
+ 		spin_unlock(&cache->lock);
  	}
- 	spin_unlock(&cache->lock);
- 	spin_unlock(&cache->space_info->lock);
- 	return 0;
+ 	return ret;
  }
  
  int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@@ -3621,6 -3964,8 +3964,8 @@@
  		fs_info->pinned_extents = &fs_info->freed_extents[0];
  
  	up_write(&fs_info->extent_commit_sem);
+ 
+ 	update_global_block_rsv(fs_info);
  	return 0;
  }
  
@@@ -3647,14 -3992,21 +3992,21 @@@ static int unpin_extent_range(struct bt
  			btrfs_add_free_space(cache, start, len);
  		}
  
+ 		start += len;
+ 
  		spin_lock(&cache->space_info->lock);
  		spin_lock(&cache->lock);
  		cache->pinned -= len;
  		cache->space_info->bytes_pinned -= len;
+ 		if (cache->ro) {
+ 			cache->space_info->bytes_readonly += len;
+ 		} else if (cache->reserved_pinned > 0) {
+ 			len = min(len, cache->reserved_pinned);
+ 			cache->reserved_pinned -= len;
+ 			cache->space_info->bytes_reserved += len;
+ 		}
  		spin_unlock(&cache->lock);
  		spin_unlock(&cache->space_info->lock);
- 
- 		start += len;
  	}
  
  	if (cache)
@@@ -3667,8 -4019,11 +4019,11 @@@ int btrfs_finish_extent_commit(struct b
  {
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	struct extent_io_tree *unpin;
+ 	struct btrfs_block_rsv *block_rsv;
+ 	struct btrfs_block_rsv *next_rsv;
  	u64 start;
  	u64 end;
+ 	int idx;
  	int ret;
  
  	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@@ -3689,59 -4044,30 +4044,30 @@@
  		cond_resched();
  	}
  
- 	return ret;
- }
- 
- static int pin_down_bytes(struct btrfs_trans_handle *trans,
- 			  struct btrfs_root *root,
- 			  struct btrfs_path *path,
- 			  u64 bytenr, u64 num_bytes,
- 			  int is_data, int reserved,
- 			  struct extent_buffer **must_clean)
- {
- 	int err = 0;
- 	struct extent_buffer *buf;
- 
- 	if (is_data)
- 		goto pinit;
- 
- 	/*
- 	 * discard is sloooow, and so triggering discards on
- 	 * individual btree blocks isn't a good plan.  Just
- 	 * pin everything in discard mode.
- 	 */
- 	if (btrfs_test_opt(root, DISCARD))
- 		goto pinit;
+ 	mutex_lock(&fs_info->durable_block_rsv_mutex);
+ 	list_for_each_entry_safe(block_rsv, next_rsv,
+ 				 &fs_info->durable_block_rsv_list, list) {
  
- 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
- 	if (!buf)
- 		goto pinit;
+ 		idx = trans->transid & 0x1;
+ 		if (block_rsv->freed[idx] > 0) {
+ 			block_rsv_add_bytes(block_rsv,
+ 					    block_rsv->freed[idx], 0);
+ 			block_rsv->freed[idx] = 0;
+ 		}
+ 		if (atomic_read(&block_rsv->usage) == 0) {
+ 			btrfs_block_rsv_release(root, block_rsv, (u64)-1);
  
- 	/* we can reuse a block if it hasn't been written
- 	 * and it is from this transaction.  We can't
- 	 * reuse anything from the tree log root because
- 	 * it has tiny sub-transactions.
- 	 */
- 	if (btrfs_buffer_uptodate(buf, 0) &&
- 	    btrfs_try_tree_lock(buf)) {
- 		u64 header_owner = btrfs_header_owner(buf);
- 		u64 header_transid = btrfs_header_generation(buf);
- 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
- 		    header_transid == trans->transid &&
- 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- 			*must_clean = buf;
- 			return 1;
+ 			if (block_rsv->freed[0] == 0 &&
+ 			    block_rsv->freed[1] == 0) {
+ 				list_del_init(&block_rsv->list);
+ 				kfree(block_rsv);
+ 			}
+ 		} else {
+ 			btrfs_block_rsv_release(root, block_rsv, 0);
  		}
- 		btrfs_tree_unlock(buf);
  	}
- 	free_extent_buffer(buf);
- pinit:
- 	if (path)
- 		btrfs_set_path_blocking(path);
- 	/* unlocks the pinned mutex */
- 	btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+ 	mutex_unlock(&fs_info->durable_block_rsv_mutex);
  
- 	BUG_ON(err < 0);
  	return 0;
  }
  
@@@ -3902,9 -4228,6 +4228,6 @@@ static int __btrfs_free_extent(struct b
  			BUG_ON(ret);
  		}
  	} else {
- 		int mark_free = 0;
- 		struct extent_buffer *must_clean = NULL;
- 
  		if (found_extent) {
  			BUG_ON(is_data && refs_to_drop !=
  			       extent_data_ref_count(root, path, iref));
@@@ -3917,31 -4240,11 +4240,11 @@@
  			}
  		}
  
- 		ret = pin_down_bytes(trans, root, path, bytenr,
- 				     num_bytes, is_data, 0, &must_clean);
- 		if (ret > 0)
- 			mark_free = 1;
- 		BUG_ON(ret < 0);
- 		/*
- 		 * it is going to be very rare for someone to be waiting
- 		 * on the block we're freeing.  del_items might need to
- 		 * schedule, so rather than get fancy, just force it
- 		 * to blocking here
- 		 */
- 		if (must_clean)
- 			btrfs_set_lock_blocking(must_clean);
- 
  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
  				      num_to_del);
  		BUG_ON(ret);
  		btrfs_release_path(extent_root, path);
  
- 		if (must_clean) {
- 			clean_tree_block(NULL, root, must_clean);
- 			btrfs_tree_unlock(must_clean);
- 			free_extent_buffer(must_clean);
- 		}
- 
  		if (is_data) {
  			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
  			BUG_ON(ret);
@@@ -3951,8 -4254,7 +4254,7 @@@
  			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
  		}
  
- 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
- 					 mark_free);
+ 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
  		BUG_ON(ret);
  	}
  	btrfs_free_path(path);
@@@ -3960,7 -4262,7 +4262,7 @@@
  }
  
  /*
-  * when we free an extent, it is possible (and likely) that we free the last
+  * when we free a block, it is possible (and likely) that we free the last
   * delayed ref for that extent as well.  This searches the delayed ref tree for
   * a given extent, and if there are no other delayed refs to be processed, it
   * removes it from the tree.
@@@ -3972,7 -4274,7 +4274,7 @@@ static noinline int check_ref_cleanup(s
  	struct btrfs_delayed_ref_root *delayed_refs;
  	struct btrfs_delayed_ref_node *ref;
  	struct rb_node *node;
- 	int ret;
+ 	int ret = 0;
  
  	delayed_refs = &trans->transaction->delayed_refs;
  	spin_lock(&delayed_refs->lock);
@@@ -4024,17 -4326,99 +4326,99 @@@
  	list_del_init(&head->cluster);
  	spin_unlock(&delayed_refs->lock);
  
- 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
- 				  &head->node, head->extent_op,
- 				  head->must_insert_reserved);
- 	BUG_ON(ret);
+ 	BUG_ON(head->extent_op);
+ 	if (head->must_insert_reserved)
+ 		ret = 1;
+ 
+ 	mutex_unlock(&head->mutex);
  	btrfs_put_delayed_ref(&head->node);
- 	return 0;
+ 	return ret;
  out:
  	spin_unlock(&delayed_refs->lock);
  	return 0;
  }
  
+ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ 			   struct btrfs_root *root,
+ 			   struct extent_buffer *buf,
+ 			   u64 parent, int last_ref)
+ {
+ 	struct btrfs_block_rsv *block_rsv;
+ 	struct btrfs_block_group_cache *cache = NULL;
+ 	int ret;
+ 
+ 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ 		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+ 						parent, root->root_key.objectid,
+ 						btrfs_header_level(buf),
+ 						BTRFS_DROP_DELAYED_REF, NULL);
+ 		BUG_ON(ret);
+ 	}
+ 
+ 	if (!last_ref)
+ 		return;
+ 
+ 	block_rsv = get_block_rsv(trans, root);
+ 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+ 	BUG_ON(block_rsv->space_info != cache->space_info);
+ 
+ 	if (btrfs_header_generation(buf) == trans->transid) {
+ 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ 			ret = check_ref_cleanup(trans, root, buf->start);
+ 			if (!ret)
+ 				goto pin;
+ 		}
+ 
+ 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ 			pin_down_extent(root, cache, buf->start, buf->len, 1);
+ 			goto pin;
+ 		}
+ 
+ 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ 
+ 		btrfs_add_free_space(cache, buf->start, buf->len);
+ 		ret = update_reserved_bytes(cache, buf->len, 0, 0);
+ 		if (ret == -EAGAIN) {
+ 			/* block group became read-only */
+ 			update_reserved_bytes(cache, buf->len, 0, 1);
+ 			goto out;
+ 		}
+ 
+ 		ret = 1;
+ 		spin_lock(&block_rsv->lock);
+ 		if (block_rsv->reserved < block_rsv->size) {
+ 			block_rsv->reserved += buf->len;
+ 			ret = 0;
+ 		}
+ 		spin_unlock(&block_rsv->lock);
+ 
+ 		if (ret) {
+ 			spin_lock(&cache->space_info->lock);
+ 			cache->space_info->bytes_reserved -= buf->len;
+ 			spin_unlock(&cache->space_info->lock);
+ 		}
+ 		goto out;
+ 	}
+ pin:
+ 	if (block_rsv->durable && !cache->ro) {
+ 		ret = 0;
+ 		spin_lock(&cache->lock);
+ 		if (!cache->ro) {
+ 			cache->reserved_pinned += buf->len;
+ 			ret = 1;
+ 		}
+ 		spin_unlock(&cache->lock);
+ 
+ 		if (ret) {
+ 			spin_lock(&block_rsv->lock);
+ 			block_rsv->freed[trans->transid & 0x1] += buf->len;
+ 			spin_unlock(&block_rsv->lock);
+ 		}
+ 	}
+ out:
+ 	btrfs_put_block_group(cache);
+ }
+ 
  int btrfs_free_extent(struct btrfs_trans_handle *trans,
  		      struct btrfs_root *root,
  		      u64 bytenr, u64 num_bytes, u64 parent,
@@@ -4056,8 -4440,6 +4440,6 @@@
  					parent, root_objectid, (int)owner,
  					BTRFS_DROP_DELAYED_REF, NULL);
  		BUG_ON(ret);
- 		ret = check_ref_cleanup(trans, root, bytenr);
- 		BUG_ON(ret);
  	} else {
  		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
  					parent, root_objectid, owner,
@@@ -4067,21 -4449,6 +4449,6 @@@
  	return ret;
  }
  
- int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- 			  struct btrfs_root *root,
- 			  u64 bytenr, u32 blocksize,
- 			  u64 parent, u64 root_objectid, int level)
- {
- 	u64 used;
- 	spin_lock(&root->node_lock);
- 	used = btrfs_root_used(&root->root_item) - blocksize;
- 	btrfs_set_root_used(&root->root_item, used);
- 	spin_unlock(&root->node_lock);
- 
- 	return btrfs_free_extent(trans, root, bytenr, blocksize,
- 				 parent, root_objectid, level, 0);
- }
- 
  static u64 stripe_align(struct btrfs_root *root, u64 val)
  {
  	u64 mask = ((u64)root->stripesize - 1);
@@@ -4134,6 -4501,22 +4501,22 @@@ wait_block_group_cache_done(struct btrf
  	return 0;
  }
  
+ static int get_block_group_index(struct btrfs_block_group_cache *cache)
+ {
+ 	int index;
+ 	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+ 		index = 0;
+ 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ 		index = 1;
+ 	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ 		index = 2;
+ 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ 		index = 3;
+ 	else
+ 		index = 4;
+ 	return index;
+ }
+ 
  enum btrfs_loop_type {
  	LOOP_FIND_IDEAL = 0,
  	LOOP_CACHING_NOWAIT = 1,
@@@ -4155,7 -4538,6 +4538,6 @@@ static noinline int find_free_extent(st
  				     u64 num_bytes, u64 empty_size,
  				     u64 search_start, u64 search_end,
  				     u64 hint_byte, struct btrfs_key *ins,
- 				     u64 exclude_start, u64 exclude_nr,
  				     int data)
  {
  	int ret = 0;
@@@ -4168,6 -4550,7 +4550,7 @@@
  	struct btrfs_space_info *space_info;
  	int last_ptr_loop = 0;
  	int loop = 0;
+ 	int index = 0;
  	bool found_uncached_bg = false;
  	bool failed_cluster_refill = false;
  	bool failed_alloc = false;
@@@ -4237,6 -4620,7 +4620,7 @@@ ideal_cache
  				btrfs_put_block_group(block_group);
  				up_read(&space_info->groups_sem);
  			} else {
+ 				index = get_block_group_index(block_group);
  				goto have_block_group;
  			}
  		} else if (block_group) {
@@@ -4245,7 -4629,8 +4629,8 @@@
  	}
  search:
  	down_read(&space_info->groups_sem);
- 	list_for_each_entry(block_group, &space_info->block_groups, list) {
+ 	list_for_each_entry(block_group, &space_info->block_groups[index],
+ 			    list) {
  		u64 offset;
  		int cached;
  
@@@ -4436,23 -4821,22 +4821,22 @@@ checks
  			goto loop;
  		}
  
- 		if (exclude_nr > 0 &&
- 		    (search_start + num_bytes > exclude_start &&
- 		     search_start < exclude_start + exclude_nr)) {
- 			search_start = exclude_start + exclude_nr;
+ 		ins->objectid = search_start;
+ 		ins->offset = num_bytes;
+ 
+ 		if (offset < search_start)
+ 			btrfs_add_free_space(block_group, offset,
+ 					     search_start - offset);
+ 		BUG_ON(offset > search_start);
  
+ 		ret = update_reserved_bytes(block_group, num_bytes, 1,
+ 					    (data & BTRFS_BLOCK_GROUP_DATA));
+ 		if (ret == -EAGAIN) {
  			btrfs_add_free_space(block_group, offset, num_bytes);
- 			/*
- 			 * if search_start is still in this block group
- 			 * then we just re-search this block group
- 			 */
- 			if (search_start >= block_group->key.objectid &&
- 			    search_start < (block_group->key.objectid +
- 					    block_group->key.offset))
- 				goto have_block_group;
  			goto loop;
  		}
  
+ 		/* we are all good, let's return */
  		ins->objectid = search_start;
  		ins->offset = num_bytes;
  
@@@ -4460,18 -4844,18 +4844,18 @@@
  			btrfs_add_free_space(block_group, offset,
  					     search_start - offset);
  		BUG_ON(offset > search_start);
- 
- 		update_reserved_extents(block_group, num_bytes, 1);
- 
- 		/* we are all good, lets return */
  		break;
  loop:
  		failed_cluster_refill = false;
  		failed_alloc = false;
+ 		BUG_ON(index != get_block_group_index(block_group));
  		btrfs_put_block_group(block_group);
  	}
  	up_read(&space_info->groups_sem);
  
+ 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ 		goto search;
+ 
  	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
  	 *			for them to make caching progress.  Also
  	 *			determine the best possible bg to cache
@@@ -4485,6 -4869,7 +4869,7 @@@
  	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
  	    (found_uncached_bg || empty_size || empty_cluster ||
  	     allowed_chunk_alloc)) {
+ 		index = 0;
  		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
  			found_uncached_bg = false;
  			loop++;
@@@ -4567,31 -4952,30 +4952,30 @@@ static void dump_space_info(struct btrf
  			    int dump_block_groups)
  {
  	struct btrfs_block_group_cache *cache;
+ 	int index = 0;
  
  	spin_lock(&info->lock);
  	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
  	       (unsigned long long)(info->total_bytes - info->bytes_used -
  				    info->bytes_pinned - info->bytes_reserved -
- 				    info->bytes_super),
+ 				    info->bytes_readonly),
  	       (info->full) ? "" : "not ");
- 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- 	       " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
- 	       "\n",
+ 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
  	       (unsigned long long)info->total_bytes,
+ 	       (unsigned long long)info->bytes_used,
  	       (unsigned long long)info->bytes_pinned,
- 	       (unsigned long long)info->bytes_delalloc,
+ 	       (unsigned long long)info->bytes_reserved,
  	       (unsigned long long)info->bytes_may_use,
- 	       (unsigned long long)info->bytes_used,
- 	       (unsigned long long)info->bytes_root,
- 	       (unsigned long long)info->bytes_super,
- 	       (unsigned long long)info->bytes_reserved);
+ 	       (unsigned long long)info->bytes_readonly);
  	spin_unlock(&info->lock);
  
  	if (!dump_block_groups)
  		return;
  
  	down_read(&info->groups_sem);
- 	list_for_each_entry(cache, &info->block_groups, list) {
+ again:
+ 	list_for_each_entry(cache, &info->block_groups[index], list) {
  		spin_lock(&cache->lock);
  		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
  		       "%llu pinned %llu reserved\n",
@@@ -4603,6 -4987,8 +4987,8 @@@
  		btrfs_dump_free_space(cache, bytes);
  		spin_unlock(&cache->lock);
  	}
+ 	if (++index < BTRFS_NR_RAID_TYPES)
+ 		goto again;
  	up_read(&info->groups_sem);
  }
  
@@@ -4628,9 -5014,8 +5014,8 @@@ again
  
  	WARN_ON(num_bytes < root->sectorsize);
  	ret = find_free_extent(trans, root, num_bytes, empty_size,
- 			       search_start, search_end, hint_byte, ins,
- 			       trans->alloc_exclude_start,
- 			       trans->alloc_exclude_nr, data);
+ 			       search_start, search_end, hint_byte,
+ 			       ins, data);
  
  	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
  		num_bytes = num_bytes >> 1;
@@@ -4668,7 -5053,7 +5053,7 @@@ int btrfs_free_reserved_extent(struct b
  	ret = btrfs_discard_extent(root, start, len);
  
  	btrfs_add_free_space(cache, start, len);
- 	update_reserved_extents(cache, len, 0);
+ 	update_reserved_bytes(cache, len, 0, 1);
  	btrfs_put_block_group(cache);
  
  	return ret;
@@@ -4731,8 -5116,7 +5116,7 @@@ static int alloc_reserved_file_extent(s
  	btrfs_mark_buffer_dirty(path->nodes[0]);
  	btrfs_free_path(path);
  
- 	ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 				 1, 0);
+ 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
  	if (ret) {
  		printk(KERN_ERR "btrfs update block group failed for %llu "
  		       "%llu\n", (unsigned long long)ins->objectid,
@@@ -4792,8 -5176,7 +5176,7 @@@ static int alloc_reserved_tree_block(st
  	btrfs_mark_buffer_dirty(leaf);
  	btrfs_free_path(path);
  
- 	ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 				 1, 0);
+ 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
  	if (ret) {
  		printk(KERN_ERR "btrfs update block group failed for %llu "
  		       "%llu\n", (unsigned long long)ins->objectid,
@@@ -4869,73 -5252,14 +5252,14 @@@ int btrfs_alloc_logged_file_extent(stru
  		put_caching_control(caching_ctl);
  	}
  
- 	update_reserved_extents(block_group, ins->offset, 1);
+ 	ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+ 	BUG_ON(ret);
  	btrfs_put_block_group(block_group);
  	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
  					 0, owner, offset, ins, 1);
  	return ret;
  }
  
- /*
-  * finds a free extent and does all the dirty work required for allocation
-  * returns the key for the extent through ins, and a tree buffer for
-  * the first block of the extent through buf.
-  *
-  * returns 0 if everything worked, non-zero otherwise.
-  */
- static int alloc_tree_block(struct btrfs_trans_handle *trans,
- 			    struct btrfs_root *root,
- 			    u64 num_bytes, u64 parent, u64 root_objectid,
- 			    struct btrfs_disk_key *key, int level,
- 			    u64 empty_size, u64 hint_byte, u64 search_end,
- 			    struct btrfs_key *ins)
- {
- 	int ret;
- 	u64 flags = 0;
- 
- 	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- 				   empty_size, hint_byte, search_end,
- 				   ins, 0);
- 	if (ret)
- 		return ret;
- 
- 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
- 		if (parent == 0)
- 			parent = ins->objectid;
- 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
- 	} else
- 		BUG_ON(parent > 0);
- 
- 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- 		struct btrfs_delayed_extent_op *extent_op;
- 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- 		BUG_ON(!extent_op);
- 		if (key)
- 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
- 		else
- 			memset(&extent_op->key, 0, sizeof(extent_op->key));
- 		extent_op->flags_to_set = flags;
- 		extent_op->update_key = 1;
- 		extent_op->update_flags = 1;
- 		extent_op->is_data = 0;
- 
- 		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
- 					ins->offset, parent, root_objectid,
- 					level, BTRFS_ADD_DELAYED_EXTENT,
- 					extent_op);
- 		BUG_ON(ret);
- 	}
- 
- 	if (root_objectid == root->root_key.objectid) {
- 		u64 used;
- 		spin_lock(&root->node_lock);
- 		used = btrfs_root_used(&root->root_item) + num_bytes;
- 		btrfs_set_root_used(&root->root_item, used);
- 		spin_unlock(&root->node_lock);
- 	}
- 	return ret;
- }
- 
  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
  					    struct btrfs_root *root,
  					    u64 bytenr, u32 blocksize,
@@@ -4974,8 -5298,45 +5298,45 @@@
  	return buf;
  }
  
+ static struct btrfs_block_rsv *
+ use_block_rsv(struct btrfs_trans_handle *trans,
+ 	      struct btrfs_root *root, u32 blocksize)
+ {
+ 	struct btrfs_block_rsv *block_rsv;
+ 	int ret;
+ 
+ 	block_rsv = get_block_rsv(trans, root);
+ 
+ 	if (block_rsv->size == 0) {
+ 		ret = reserve_metadata_bytes(block_rsv, blocksize);
+ 		if (ret)
+ 			return ERR_PTR(ret);
+ 		return block_rsv;
+ 	}
+ 
+ 	ret = block_rsv_use_bytes(block_rsv, blocksize);
+ 	if (!ret)
+ 		return block_rsv;
+ 
+ 	WARN_ON(1);
+ 	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ 		block_rsv->size, block_rsv->reserved,
+ 		block_rsv->freed[0], block_rsv->freed[1]);
+ 
+ 	return ERR_PTR(-ENOSPC);
+ }
+ 
+ static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+ {
+ 	block_rsv_add_bytes(block_rsv, blocksize, 0);
+ 	block_rsv_release_bytes(block_rsv, NULL, 0);
+ }
+ 
  /*
-  * helper function to allocate a block for a given tree
+  * finds a free extent and does all the dirty work required for allocation
+  * returns the key for the extent through ins, and a tree buffer for
+  * the first block of the extent through buf.
+  *
   * returns the tree buffer or NULL.
   */
  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@@ -4985,18 -5346,53 +5346,53 @@@
  					u64 hint, u64 empty_size)
  {
  	struct btrfs_key ins;
- 	int ret;
+ 	struct btrfs_block_rsv *block_rsv;
  	struct extent_buffer *buf;
+ 	u64 flags = 0;
+ 	int ret;
+ 
  
- 	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
- 			       key, level, empty_size, hint, (u64)-1, &ins);
+ 	block_rsv = use_block_rsv(trans, root, blocksize);
+ 	if (IS_ERR(block_rsv))
+ 		return ERR_CAST(block_rsv);
+ 
+ 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+ 				   empty_size, hint, (u64)-1, &ins, 0);
  	if (ret) {
- 		BUG_ON(ret > 0);
+ 		unuse_block_rsv(block_rsv, blocksize);
  		return ERR_PTR(ret);
  	}
  
  	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
  				    blocksize, level);
+ 	BUG_ON(IS_ERR(buf));
+ 
+ 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ 		if (parent == 0)
+ 			parent = ins.objectid;
+ 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ 	} else
+ 		BUG_ON(parent > 0);
+ 
+ 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ 		struct btrfs_delayed_extent_op *extent_op;
+ 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ 		BUG_ON(!extent_op);
+ 		if (key)
+ 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ 		else
+ 			memset(&extent_op->key, 0, sizeof(extent_op->key));
+ 		extent_op->flags_to_set = flags;
+ 		extent_op->update_key = 1;
+ 		extent_op->update_flags = 1;
+ 		extent_op->is_data = 0;
+ 
+ 		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ 					ins.offset, parent, root_objectid,
+ 					level, BTRFS_ADD_DELAYED_EXTENT,
+ 					extent_op);
+ 		BUG_ON(ret);
+ 	}
  	return buf;
  }
  
@@@ -5321,7 -5717,7 +5717,7 @@@ static noinline int walk_up_proc(struc
  				 struct btrfs_path *path,
  				 struct walk_control *wc)
  {
- 	int ret = 0;
+ 	int ret;
  	int level = wc->level;
  	struct extent_buffer *eb = path->nodes[level];
  	u64 parent = 0;
@@@ -5399,13 -5795,11 +5795,11 @@@
  			       btrfs_header_owner(path->nodes[level + 1]));
  	}
  
- 	ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
- 				root->root_key.objectid, level, 0);
- 	BUG_ON(ret);
+ 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
  out:
  	wc->refs[level] = 0;
  	wc->flags[level] = 0;
- 	return ret;
+ 	return 0;
  }
  
  static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@@ -5483,7 -5877,8 +5877,8 @@@ static noinline int walk_up_tree(struc
   * also make sure backrefs for the shared block and all lower level
   * blocks are properly updated.
   */
- int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+ int btrfs_drop_snapshot(struct btrfs_root *root,
+ 			struct btrfs_block_rsv *block_rsv, int update_ref)
  {
  	struct btrfs_path *path;
  	struct btrfs_trans_handle *trans;
@@@ -5501,7 -5896,9 +5896,9 @@@
  	wc = kzalloc(sizeof(*wc), GFP_NOFS);
  	BUG_ON(!wc);
  
- 	trans = btrfs_start_transaction(tree_root, 1);
+ 	trans = btrfs_start_transaction(tree_root, 0);
+ 	if (block_rsv)
+ 		trans->block_rsv = block_rsv;
  
  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
  		level = btrfs_header_level(root->node);
@@@ -5589,22 -5986,16 +5986,16 @@@
  		}
  
  		BUG_ON(wc->level == 0);
- 		if (trans->transaction->in_commit ||
- 		    trans->transaction->delayed_refs.flushing) {
+ 		if (btrfs_should_end_transaction(trans, tree_root)) {
  			ret = btrfs_update_root(trans, tree_root,
  						&root->root_key,
  						root_item);
  			BUG_ON(ret);
  
- 			btrfs_end_transaction(trans, tree_root);
- 			trans = btrfs_start_transaction(tree_root, 1);
- 		} else {
- 			unsigned long update;
- 			update = trans->delayed_ref_updates;
- 			trans->delayed_ref_updates = 0;
- 			if (update)
- 				btrfs_run_delayed_refs(trans, tree_root,
- 						       update);
+ 			btrfs_end_transaction_throttle(trans, tree_root);
+ 			trans = btrfs_start_transaction(tree_root, 0);
+ 			if (block_rsv)
+ 				trans->block_rsv = block_rsv;
  		}
  	}
  	btrfs_release_path(root, path);
@@@ -5632,7 -6023,7 +6023,7 @@@
  		kfree(root);
  	}
  out:
- 	btrfs_end_transaction(trans, tree_root);
+ 	btrfs_end_transaction_throttle(trans, tree_root);
  	kfree(wc);
  	btrfs_free_path(path);
  	return err;
@@@ -7228,48 -7619,80 +7619,80 @@@ static u64 update_block_group_flags(str
  	return flags;
  }
  
- static int __alloc_chunk_for_shrink(struct btrfs_root *root,
- 		     struct btrfs_block_group_cache *shrink_block_group,
- 		     int force)
+ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
  {
- 	struct btrfs_trans_handle *trans;
- 	u64 new_alloc_flags;
- 	u64 calc;
+ 	struct btrfs_space_info *sinfo = cache->space_info;
+ 	u64 num_bytes;
+ 	int ret = -ENOSPC;
  
- 	spin_lock(&shrink_block_group->lock);
- 	if (btrfs_block_group_used(&shrink_block_group->item) +
- 	    shrink_block_group->reserved > 0) {
- 		spin_unlock(&shrink_block_group->lock);
+ 	if (cache->ro)
+ 		return 0;
  
- 		trans = btrfs_start_transaction(root, 1);
- 		spin_lock(&shrink_block_group->lock);
+ 	spin_lock(&sinfo->lock);
+ 	spin_lock(&cache->lock);
+ 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+ 
+ 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+ 	    sinfo->bytes_may_use + sinfo->bytes_readonly +
+ 	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ 		sinfo->bytes_readonly += num_bytes;
+ 		sinfo->bytes_reserved += cache->reserved_pinned;
+ 		cache->reserved_pinned = 0;
+ 		cache->ro = 1;
+ 		ret = 0;
+ 	}
+ 	spin_unlock(&cache->lock);
+ 	spin_unlock(&sinfo->lock);
+ 	return ret;
+ }
  
- 		new_alloc_flags = update_block_group_flags(root,
- 						   shrink_block_group->flags);
- 		if (new_alloc_flags != shrink_block_group->flags) {
- 			calc =
- 			     btrfs_block_group_used(&shrink_block_group->item);
- 		} else {
- 			calc = shrink_block_group->key.offset;
- 		}
- 		spin_unlock(&shrink_block_group->lock);
+ int btrfs_set_block_group_ro(struct btrfs_root *root,
+ 			     struct btrfs_block_group_cache *cache)
  
- 		do_chunk_alloc(trans, root->fs_info->extent_root,
- 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+ {
+ 	struct btrfs_trans_handle *trans;
+ 	u64 alloc_flags;
+ 	int ret;
  
- 		btrfs_end_transaction(trans, root);
- 	} else
- 		spin_unlock(&shrink_block_group->lock);
- 	return 0;
- }
+ 	BUG_ON(cache->ro);
  
+ 	trans = btrfs_join_transaction(root, 1);
+ 	BUG_ON(IS_ERR(trans));
  
- int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- 					 struct btrfs_block_group_cache *group)
+ 	alloc_flags = update_block_group_flags(root, cache->flags);
+ 	if (alloc_flags != cache->flags)
+ 		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ 
+ 	ret = set_block_group_ro(cache);
+ 	if (!ret)
+ 		goto out;
+ 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+ 	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ 	if (ret < 0)
+ 		goto out;
+ 	ret = set_block_group_ro(cache);
+ out:
+ 	btrfs_end_transaction(trans, root);
+ 	return ret;
+ }
  
+ int btrfs_set_block_group_rw(struct btrfs_root *root,
+ 			      struct btrfs_block_group_cache *cache)
  {
- 	__alloc_chunk_for_shrink(root, group, 1);
- 	set_block_group_readonly(group);
+ 	struct btrfs_space_info *sinfo = cache->space_info;
+ 	u64 num_bytes;
+ 
+ 	BUG_ON(!cache->ro);
+ 
+ 	spin_lock(&sinfo->lock);
+ 	spin_lock(&cache->lock);
+ 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+ 	sinfo->bytes_readonly -= num_bytes;
+ 	cache->ro = 0;
+ 	spin_unlock(&cache->lock);
+ 	spin_unlock(&sinfo->lock);
  	return 0;
  }
  
@@@ -7436,17 -7859,33 +7859,33 @@@ int btrfs_free_block_groups(struct btrf
  	 */
  	synchronize_rcu();
  
+ 	release_global_block_rsv(info);
+ 
  	while(!list_empty(&info->space_info)) {
  		space_info = list_entry(info->space_info.next,
  					struct btrfs_space_info,
  					list);
- 
+ 		if (space_info->bytes_pinned > 0 ||
+ 		    space_info->bytes_reserved > 0) {
+ 			WARN_ON(1);
+ 			dump_space_info(space_info, 0, 0);
+ 		}
  		list_del(&space_info->list);
  		kfree(space_info);
  	}
  	return 0;
  }
  
+ static void __link_block_group(struct btrfs_space_info *space_info,
+ 			       struct btrfs_block_group_cache *cache)
+ {
+ 	int index = get_block_group_index(cache);
+ 
+ 	down_write(&space_info->groups_sem);
+ 	list_add_tail(&cache->list, &space_info->block_groups[index]);
+ 	up_write(&space_info->groups_sem);
+ }
+ 
  int btrfs_read_block_groups(struct btrfs_root *root)
  {
  	struct btrfs_path *path;
@@@ -7468,10 -7907,8 +7907,8 @@@
  
  	while (1) {
  		ret = find_first_block_group(root, path, &key);
- 		if (ret > 0) {
- 			ret = 0;
- 			goto error;
- 		}
+ 		if (ret > 0)
+ 			break;
  		if (ret != 0)
  			goto error;
  
@@@ -7480,7 -7917,7 +7917,7 @@@
  		cache = kzalloc(sizeof(*cache), GFP_NOFS);
  		if (!cache) {
  			ret = -ENOMEM;
- 			break;
+ 			goto error;
  		}
  
  		atomic_set(&cache->count, 1);
@@@ -7537,20 -7974,36 +7974,36 @@@
  		BUG_ON(ret);
  		cache->space_info = space_info;
  		spin_lock(&cache->space_info->lock);
- 		cache->space_info->bytes_super += cache->bytes_super;
+ 		cache->space_info->bytes_readonly += cache->bytes_super;
  		spin_unlock(&cache->space_info->lock);
  
- 		down_write(&space_info->groups_sem);
- 		list_add_tail(&cache->list, &space_info->block_groups);
- 		up_write(&space_info->groups_sem);
+ 		__link_block_group(space_info, cache);
  
  		ret = btrfs_add_block_group_cache(root->fs_info, cache);
  		BUG_ON(ret);
  
  		set_avail_alloc_bits(root->fs_info, cache->flags);
  		if (btrfs_chunk_readonly(root, cache->key.objectid))
- 			set_block_group_readonly(cache);
+ 			set_block_group_ro(cache);
+ 	}
+ 
+ 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+ 		if (!(get_alloc_profile(root, space_info->flags) &
+ 		      (BTRFS_BLOCK_GROUP_RAID10 |
+ 		       BTRFS_BLOCK_GROUP_RAID1 |
+ 		       BTRFS_BLOCK_GROUP_DUP)))
+ 			continue;
+ 		/*
+ 		 * avoid allocating from un-mirrored block group if there are
+ 		 * mirrored block groups.
+ 		 */
+ 		list_for_each_entry(cache, &space_info->block_groups[3], list)
+ 			set_block_group_ro(cache);
+ 		list_for_each_entry(cache, &space_info->block_groups[4], list)
+ 			set_block_group_ro(cache);
  	}
+ 
+ 	init_global_block_rsv(info);
  	ret = 0;
  error:
  	btrfs_free_path(path);
@@@ -7611,12 -8064,10 +8064,10 @@@ int btrfs_make_block_group(struct btrfs
  	BUG_ON(ret);
  
  	spin_lock(&cache->space_info->lock);
- 	cache->space_info->bytes_super += cache->bytes_super;
+ 	cache->space_info->bytes_readonly += cache->bytes_super;
  	spin_unlock(&cache->space_info->lock);
  
- 	down_write(&cache->space_info->groups_sem);
- 	list_add_tail(&cache->list, &cache->space_info->block_groups);
- 	up_write(&cache->space_info->groups_sem);
+ 	__link_block_group(cache->space_info, cache);
  
  	ret = btrfs_add_block_group_cache(root->fs_info, cache);
  	BUG_ON(ret);
diff --combined fs/btrfs/inode.c
index d601629b85d1,2551b8018399..fa6ccc1bfe2a
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -252,6 -252,7 +252,7 @@@ static noinline int cow_file_range_inli
  				   inline_len, compressed_size,
  				   compressed_pages);
  	BUG_ON(ret);
+ 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
  	return 0;
  }
@@@ -414,6 -415,7 +415,7 @@@ again
  		trans = btrfs_join_transaction(root, 1);
  		BUG_ON(!trans);
  		btrfs_set_trans_block_group(trans, inode);
+ 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
  		/* lets try to make an inline extent */
  		if (ret || total_in < (actual_end - start)) {
@@@ -439,7 -441,6 +441,6 @@@
  			     start, end, NULL,
  			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
  			     EXTENT_CLEAR_DELALLOC |
- 			     EXTENT_CLEAR_ACCOUNTING |
  			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
  
  			btrfs_end_transaction(trans, root);
@@@ -697,6 -698,38 +698,38 @@@ retry
  	return 0;
  }
  
+ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+ 				      u64 num_bytes)
+ {
+ 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ 	struct extent_map *em;
+ 	u64 alloc_hint = 0;
+ 
+ 	read_lock(&em_tree->lock);
+ 	em = search_extent_mapping(em_tree, start, num_bytes);
+ 	if (em) {
+ 		/*
+ 		 * if block start isn't an actual block number then find the
+ 		 * first block in this inode and use that as a hint.  If that
+ 		 * block is also bogus then just don't worry about it.
+ 		 */
+ 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ 			free_extent_map(em);
+ 			em = search_extent_mapping(em_tree, 0, 0);
+ 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ 				alloc_hint = em->block_start;
+ 			if (em)
+ 				free_extent_map(em);
+ 		} else {
+ 			alloc_hint = em->block_start;
+ 			free_extent_map(em);
+ 		}
+ 	}
+ 	read_unlock(&em_tree->lock);
+ 
+ 	return alloc_hint;
+ }
+ 
  /*
   * when extent_io.c finds a delayed allocation range in the file,
   * the call backs end up in this code.  The basic idea is to
@@@ -734,6 -767,7 +767,7 @@@ static noinline int cow_file_range(stru
  	trans = btrfs_join_transaction(root, 1);
  	BUG_ON(!trans);
  	btrfs_set_trans_block_group(trans, inode);
+ 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
  	actual_end = min_t(u64, isize, end + 1);
  
@@@ -753,7 -787,6 +787,6 @@@
  				     EXTENT_CLEAR_UNLOCK_PAGE |
  				     EXTENT_CLEAR_UNLOCK |
  				     EXTENT_CLEAR_DELALLOC |
- 				     EXTENT_CLEAR_ACCOUNTING |
  				     EXTENT_CLEAR_DIRTY |
  				     EXTENT_SET_WRITEBACK |
  				     EXTENT_END_WRITEBACK);
@@@ -769,29 -802,7 +802,7 @@@
  	BUG_ON(disk_num_bytes >
  	       btrfs_super_total_bytes(&root->fs_info->super_copy));
  
- 
- 	read_lock(&BTRFS_I(inode)->extent_tree.lock);
- 	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
- 				   start, num_bytes);
- 	if (em) {
- 		/*
- 		 * if block start isn't an actual block number then find the
- 		 * first block in this inode and use that as a hint.  If that
- 		 * block is also bogus then just don't worry about it.
- 		 */
- 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- 			free_extent_map(em);
- 			em = search_extent_mapping(em_tree, 0, 0);
- 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- 				alloc_hint = em->block_start;
- 			if (em)
- 				free_extent_map(em);
- 		} else {
- 			alloc_hint = em->block_start;
- 			free_extent_map(em);
- 		}
- 	}
- 	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
  	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
  
  	while (disk_num_bytes > 0) {
@@@ -1174,6 -1185,13 +1185,13 @@@ out_check
  					       num_bytes, num_bytes, type);
  		BUG_ON(ret);
  
+ 		if (root->root_key.objectid ==
+ 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ 						      num_bytes);
+ 			BUG_ON(ret);
+ 		}
+ 
  		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
  				cur_offset, cur_offset + num_bytes - 1,
  				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@@ -1226,15 -1244,13 +1244,13 @@@ static int run_delalloc_range(struct in
  }
  
  static int btrfs_split_extent_hook(struct inode *inode,
- 				    struct extent_state *orig, u64 split)
+ 				   struct extent_state *orig, u64 split)
  {
+ 	/* not delalloc, ignore it */
  	if (!(orig->state & EXTENT_DELALLOC))
  		return 0;
  
- 	spin_lock(&BTRFS_I(inode)->accounting_lock);
- 	BTRFS_I(inode)->outstanding_extents++;
- 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
- 
+ 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
  	return 0;
  }
  
@@@ -1252,10 -1268,7 +1268,7 @@@ static int btrfs_merge_extent_hook(stru
  	if (!(other->state & EXTENT_DELALLOC))
  		return 0;
  
- 	spin_lock(&BTRFS_I(inode)->accounting_lock);
- 	BTRFS_I(inode)->outstanding_extents--;
- 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
- 
+ 	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
  	return 0;
  }
  
@@@ -1264,8 -1277,8 +1277,8 @@@
   * bytes in this file, and to maintain the list of inodes that
   * have pending delalloc work to be done.
   */
- static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
- 		       unsigned long old, unsigned long bits)
+ static int btrfs_set_bit_hook(struct inode *inode,
+ 			      struct extent_state *state, int *bits)
  {
  
  	/*
@@@ -1273,17 -1286,18 +1286,18 @@@
  	 * but in this case, we are only testeing for the DELALLOC
  	 * bit, which is only set or cleared with irqs on
  	 */
- 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
  		struct btrfs_root *root = BTRFS_I(inode)->root;
+ 		u64 len = state->end + 1 - state->start;
  
- 		spin_lock(&BTRFS_I(inode)->accounting_lock);
- 		BTRFS_I(inode)->outstanding_extents++;
- 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
- 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ 		if (*bits & EXTENT_FIRST_DELALLOC)
+ 			*bits &= ~EXTENT_FIRST_DELALLOC;
+ 		else
+ 			atomic_inc(&BTRFS_I(inode)->outstanding_extents);
  
  		spin_lock(&root->fs_info->delalloc_lock);
- 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
- 		root->fs_info->delalloc_bytes += end - start + 1;
+ 		BTRFS_I(inode)->delalloc_bytes += len;
+ 		root->fs_info->delalloc_bytes += len;
  		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
  			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
  				      &root->fs_info->delalloc_inodes);
@@@ -1297,45 -1311,32 +1311,32 @@@
   * extent_io.c clear_bit_hook, see set_bit_hook for why
   */
  static int btrfs_clear_bit_hook(struct inode *inode,
- 				struct extent_state *state, unsigned long bits)
+ 				struct extent_state *state, int *bits)
  {
  	/*
  	 * set_bit and clear bit hooks normally require _irqsave/restore
  	 * but in this case, we are only testeing for the DELALLOC
  	 * bit, which is only set or cleared with irqs on
  	 */
- 	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
  		struct btrfs_root *root = BTRFS_I(inode)->root;
+ 		u64 len = state->end + 1 - state->start;
  
- 		if (bits & EXTENT_DO_ACCOUNTING) {
- 			spin_lock(&BTRFS_I(inode)->accounting_lock);
- 			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- 			BTRFS_I(inode)->outstanding_extents--;
- 			spin_unlock(&BTRFS_I(inode)->accounting_lock);
- 			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
- 		}
+ 		if (*bits & EXTENT_FIRST_DELALLOC)
+ 			*bits &= ~EXTENT_FIRST_DELALLOC;
+ 		else if (!(*bits & EXTENT_DO_ACCOUNTING))
+ 			atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+ 
+ 		if (*bits & EXTENT_DO_ACCOUNTING)
+ 			btrfs_delalloc_release_metadata(inode, len);
+ 
+ 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ 			btrfs_free_reserved_data_space(inode, len);
  
  		spin_lock(&root->fs_info->delalloc_lock);
- 		if (state->end - state->start + 1 >
- 		    root->fs_info->delalloc_bytes) {
- 			printk(KERN_INFO "btrfs warning: delalloc account "
- 			       "%llu %llu\n",
- 			       (unsigned long long)
- 			       state->end - state->start + 1,
- 			       (unsigned long long)
- 			       root->fs_info->delalloc_bytes);
- 			btrfs_delalloc_free_space(root, inode, (u64)-1);
- 			root->fs_info->delalloc_bytes = 0;
- 			BTRFS_I(inode)->delalloc_bytes = 0;
- 		} else {
- 			btrfs_delalloc_free_space(root, inode,
- 						  state->end -
- 						  state->start + 1);
- 			root->fs_info->delalloc_bytes -= state->end -
- 				state->start + 1;
- 			BTRFS_I(inode)->delalloc_bytes -= state->end -
- 				state->start + 1;
- 		}
+ 		root->fs_info->delalloc_bytes -= len;
+ 		BTRFS_I(inode)->delalloc_bytes -= len;
+ 
  		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
  		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
  			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@@ -1384,7 -1385,8 +1385,8 @@@ int btrfs_merge_bio_hook(struct page *p
   */
  static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  				    struct bio *bio, int mirror_num,
- 				    unsigned long bio_flags)
+ 				    unsigned long bio_flags,
+ 				    u64 bio_offset)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	int ret = 0;
@@@ -1403,7 -1405,8 +1405,8 @@@
   * are inserted into the btree
   */
  static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- 			  int mirror_num, unsigned long bio_flags)
+ 			  int mirror_num, unsigned long bio_flags,
+ 			  u64 bio_offset)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@@ -1414,7 -1417,8 +1417,8 @@@
   * on write, or reading the csums from the tree before a read
   */
  static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- 			  int mirror_num, unsigned long bio_flags)
+ 			  int mirror_num, unsigned long bio_flags,
+ 			  u64 bio_offset)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	int ret = 0;
@@@ -1439,7 -1443,8 +1443,8 @@@
  		/* we're doing a write, do the async checksumming */
  		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
  				   inode, rw, bio, mirror_num,
- 				   bio_flags, __btrfs_submit_bio_start,
+ 				   bio_flags, bio_offset,
+ 				   __btrfs_submit_bio_start,
  				   __btrfs_submit_bio_done);
  	}
  
@@@ -1520,6 -1525,7 +1525,7 @@@ again
  		goto again;
  	}
  
+ 	BUG();
  	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
  	ClearPageChecked(page);
  out:
@@@ -1650,7 -1656,7 +1656,7 @@@ static int insert_reserved_file_extent(
  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
- 	struct btrfs_trans_handle *trans;
+ 	struct btrfs_trans_handle *trans = NULL;
  	struct btrfs_ordered_extent *ordered_extent = NULL;
  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
  	struct extent_state *cached_state = NULL;
@@@ -1668,9 -1674,10 +1674,10 @@@
  		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
  		if (!ret) {
  			trans = btrfs_join_transaction(root, 1);
+ 			btrfs_set_trans_block_group(trans, inode);
+ 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  			ret = btrfs_update_inode(trans, root, inode);
  			BUG_ON(ret);
- 			btrfs_end_transaction(trans, root);
  		}
  		goto out;
  	}
@@@ -1680,6 -1687,8 +1687,8 @@@
  			 0, &cached_state, GFP_NOFS);
  
  	trans = btrfs_join_transaction(root, 1);
+ 	btrfs_set_trans_block_group(trans, inode);
+ 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
  		compressed = 1;
@@@ -1711,12 -1720,13 +1720,13 @@@
  	add_pending_csums(trans, inode, ordered_extent->file_offset,
  			  &ordered_extent->list);
  
- 	/* this also removes the ordered extent from the tree */
  	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
  	ret = btrfs_update_inode(trans, root, inode);
  	BUG_ON(ret);
- 	btrfs_end_transaction(trans, root);
  out:
+ 	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ 	if (trans)
+ 		btrfs_end_transaction(trans, root);
  	/* once for us */
  	btrfs_put_ordered_extent(ordered_extent);
  	/* once for the tree */
@@@ -1838,7 -1848,7 +1848,7 @@@ static int btrfs_io_failed_hook(struct 
  
  	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
  						      failrec->last_mirror,
- 						      failrec->bio_flags);
+ 						      failrec->bio_flags, 0);
  	return 0;
  }
  
@@@ -1992,33 -2002,197 +2002,197 @@@ void btrfs_run_delayed_iputs(struct btr
  	up_read(&root->fs_info->cleanup_work_sem);
  }
  
+ /*
+  * calculate extra metadata reservation when snapshotting a subvolume
+  * that contains orphan files.
+  */
+ void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ 				struct btrfs_pending_snapshot *pending,
+ 				u64 *bytes_to_reserve)
+ {
+ 	struct btrfs_root *root;
+ 	struct btrfs_block_rsv *block_rsv;
+ 	u64 num_bytes;
+ 	int index;
+ 
+ 	root = pending->root;
+ 	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ 		return;
+ 
+ 	block_rsv = root->orphan_block_rsv;
+ 
+ 	/* orphan block reservation for the snapshot */
+ 	num_bytes = block_rsv->size;
+ 
+ 	/*
+ 	 * after the snapshot is created, COWing tree blocks may use more
+ 	 * space than it frees. So we should make sure there is enough
+ 	 * reserved space.
+ 	 */
+ 	index = trans->transid & 0x1;
+ 	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ 		num_bytes += block_rsv->size -
+ 			     (block_rsv->reserved + block_rsv->freed[index]);
+ 	}
+ 
+ 	*bytes_to_reserve += num_bytes;
+ }
+ 
+ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ 				struct btrfs_pending_snapshot *pending)
+ {
+ 	struct btrfs_root *root = pending->root;
+ 	struct btrfs_root *snap = pending->snap;
+ 	struct btrfs_block_rsv *block_rsv;
+ 	u64 num_bytes;
+ 	int index;
+ 	int ret;
+ 
+ 	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ 		return;
+ 
+ 	/* refill source subvolume's orphan block reservation */
+ 	block_rsv = root->orphan_block_rsv;
+ 	index = trans->transid & 0x1;
+ 	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ 		num_bytes = block_rsv->size -
+ 			    (block_rsv->reserved + block_rsv->freed[index]);
+ 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ 					      root->orphan_block_rsv,
+ 					      num_bytes);
+ 		BUG_ON(ret);
+ 	}
+ 
+ 	/* setup orphan block reservation for the snapshot */
+ 	block_rsv = btrfs_alloc_block_rsv(snap);
+ 	BUG_ON(!block_rsv);
+ 
+ 	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+ 	snap->orphan_block_rsv = block_rsv;
+ 
+ 	num_bytes = root->orphan_block_rsv->size;
+ 	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ 				      block_rsv, num_bytes);
+ 	BUG_ON(ret);
+ 
+ #if 0
+ 	/* insert orphan item for the snapshot */
+ 	WARN_ON(!root->orphan_item_inserted);
+ 	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ 				       snap->root_key.objectid);
+ 	BUG_ON(ret);
+ 	snap->orphan_item_inserted = 1;
+ #endif
+ }
+ 
+ enum btrfs_orphan_cleanup_state {
+ 	ORPHAN_CLEANUP_STARTED	= 1,
+ 	ORPHAN_CLEANUP_DONE	= 2,
+ };
+ 
+ /*
+  * This is called at transaction commit time. If there are no orphan
+  * files in the subvolume, it removes the orphan item and frees the
+  * block_rsv structure.
+  */
+ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ 			      struct btrfs_root *root)
+ {
+ 	int ret;
+ 
+ 	if (!list_empty(&root->orphan_list) ||
+ 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+ 		return;
+ 
+ 	if (root->orphan_item_inserted &&
+ 	    btrfs_root_refs(&root->root_item) > 0) {
+ 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ 					    root->root_key.objectid);
+ 		BUG_ON(ret);
+ 		root->orphan_item_inserted = 0;
+ 	}
+ 
+ 	if (root->orphan_block_rsv) {
+ 		WARN_ON(root->orphan_block_rsv->size > 0);
+ 		btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ 		root->orphan_block_rsv = NULL;
+ 	}
+ }
+ 
  /*
   * This creates an orphan entry for the given inode in case something goes
   * wrong in the middle of an unlink/truncate.
+  *
+  * NOTE: caller of this function should reserve 5 units of metadata for
+  *	 this function.
   */
  int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
- 	int ret = 0;
+ 	struct btrfs_block_rsv *block_rsv = NULL;
+ 	int reserve = 0;
+ 	int insert = 0;
+ 	int ret;
+ 
+ 	if (!root->orphan_block_rsv) {
+ 		block_rsv = btrfs_alloc_block_rsv(root);
+ 		BUG_ON(!block_rsv);
+ 	}
  
- 	spin_lock(&root->list_lock);
+ 	spin_lock(&root->orphan_lock);
+ 	if (!root->orphan_block_rsv) {
+ 		root->orphan_block_rsv = block_rsv;
+ 	} else if (block_rsv) {
+ 		btrfs_free_block_rsv(root, block_rsv);
+ 		block_rsv = NULL;
+ 	}
  
- 	/* already on the orphan list, we're good */
- 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- 		spin_unlock(&root->list_lock);
- 		return 0;
+ 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ #if 0
+ 		/*
+ 		 * For proper ENOSPC handling, we should do orphan
+ 		 * cleanup when mounting. But this introduces backward
+ 		 * compatibility issue.
+ 		 */
+ 		if (!xchg(&root->orphan_item_inserted, 1))
+ 			insert = 2;
+ 		else
+ 			insert = 1;
+ #endif
+ 		insert = 1;
+ 	} else {
+ 		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
  	}
  
- 	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ 	if (!BTRFS_I(inode)->orphan_meta_reserved) {
+ 		BTRFS_I(inode)->orphan_meta_reserved = 1;
+ 		reserve = 1;
+ 	}
+ 	spin_unlock(&root->orphan_lock);
  
- 	spin_unlock(&root->list_lock);
+ 	if (block_rsv)
+ 		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
  
- 	/*
- 	 * insert an orphan item to track this unlinked/truncated file
- 	 */
- 	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ 	/* grab metadata reservation from transaction handle */
+ 	if (reserve) {
+ 		ret = btrfs_orphan_reserve_metadata(trans, inode);
+ 		BUG_ON(ret);
+ 	}
  
- 	return ret;
+ 	/* insert an orphan item to track this unlinked/truncated file */
+ 	if (insert >= 1) {
+ 		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ 		BUG_ON(ret);
+ 	}
+ 
+ 	/* insert an orphan item to track a subvolume that has orphan files */
+ 	if (insert >= 2) {
+ 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ 					       root->root_key.objectid);
+ 		BUG_ON(ret);
+ 	}
+ 	return 0;
  }
  
  /*
@@@ -2028,26 -2202,31 +2202,31 @@@
  int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	int delete_item = 0;
+ 	int release_rsv = 0;
  	int ret = 0;
  
- 	spin_lock(&root->list_lock);
- 
- 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- 		spin_unlock(&root->list_lock);
- 		return 0;
+ 	spin_lock(&root->orphan_lock);
+ 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ 		list_del_init(&BTRFS_I(inode)->i_orphan);
+ 		delete_item = 1;
  	}
  
- 	list_del_init(&BTRFS_I(inode)->i_orphan);
- 	if (!trans) {
- 		spin_unlock(&root->list_lock);
- 		return 0;
+ 	if (BTRFS_I(inode)->orphan_meta_reserved) {
+ 		BTRFS_I(inode)->orphan_meta_reserved = 0;
+ 		release_rsv = 1;
  	}
+ 	spin_unlock(&root->orphan_lock);
  
- 	spin_unlock(&root->list_lock);
+ 	if (trans && delete_item) {
+ 		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ 		BUG_ON(ret);
+ 	}
  
- 	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ 	if (release_rsv)
+ 		btrfs_orphan_release_metadata(inode);
  
- 	return ret;
+ 	return 0;
  }
  
  /*
@@@ -2064,7 -2243,7 +2243,7 @@@ void btrfs_orphan_cleanup(struct btrfs_
  	struct inode *inode;
  	int ret = 0, nr_unlink = 0, nr_truncate = 0;
  
- 	if (!xchg(&root->clean_orphans, 0))
+ 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
  		return;
  
  	path = btrfs_alloc_path();
@@@ -2117,16 -2296,15 +2296,15 @@@
  		found_key.type = BTRFS_INODE_ITEM_KEY;
  		found_key.offset = 0;
  		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- 		if (IS_ERR(inode))
- 			break;
+ 		BUG_ON(IS_ERR(inode));
  
  		/*
  		 * add this inode to the orphan list so btrfs_orphan_del does
  		 * the proper thing when we hit it
  		 */
- 		spin_lock(&root->list_lock);
+ 		spin_lock(&root->orphan_lock);
  		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- 		spin_unlock(&root->list_lock);
+ 		spin_unlock(&root->orphan_lock);
  
  		/*
  		 * if this is a bad inode, means we actually succeeded in
@@@ -2135,7 -2313,7 +2313,7 @@@
  		 * do a destroy_inode
  		 */
  		if (is_bad_inode(inode)) {
- 			trans = btrfs_start_transaction(root, 1);
+ 			trans = btrfs_start_transaction(root, 0);
  			btrfs_orphan_del(trans, inode);
  			btrfs_end_transaction(trans, root);
  			iput(inode);
@@@ -2153,13 -2331,23 +2331,23 @@@
  		/* this will do delete_inode and everything for us */
  		iput(inode);
  	}
+ 	btrfs_free_path(path);
+ 
+ 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+ 
+ 	if (root->orphan_block_rsv)
+ 		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ 					(u64)-1);
+ 
+ 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ 		trans = btrfs_join_transaction(root, 1);
+ 		btrfs_end_transaction(trans, root);
+ 	}
  
  	if (nr_unlink)
  		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
  	if (nr_truncate)
  		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
- 
- 	btrfs_free_path(path);
  }
  
  /*
@@@ -2478,103 -2666,276 +2666,276 @@@ out
  	return ret;
  }
  
- static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ /* helper to check if there is any shared block in the path */
+ static int check_path_shared(struct btrfs_root *root,
+ 			     struct btrfs_path *path)
  {
- 	struct btrfs_root *root;
- 	struct btrfs_trans_handle *trans;
- 	struct inode *inode = dentry->d_inode;
+ 	struct extent_buffer *eb;
+ 	int level;
  	int ret;
- 	unsigned long nr = 0;
- 
- 	root = BTRFS_I(dir)->root;
- 
- 	/*
- 	 * 5 items for unlink inode
- 	 * 1 for orphan
- 	 */
- 	ret = btrfs_reserve_metadata_space(root, 6);
- 	if (ret)
- 		return ret;
+ 	u64 refs;
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (IS_ERR(trans)) {
- 		btrfs_unreserve_metadata_space(root, 6);
- 		return PTR_ERR(trans);
+ 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ 		if (!path->nodes[level])
+ 			break;
+ 		eb = path->nodes[level];
+ 		if (!btrfs_block_can_be_shared(root, eb))
+ 			continue;
+ 		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+ 					       &refs, NULL);
+ 		if (refs > 1)
+ 			return 1;
  	}
- 
- 	btrfs_set_trans_block_group(trans, dir);
- 
- 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
- 
- 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
- 				 dentry->d_name.name, dentry->d_name.len);
- 
- 	if (inode->i_nlink == 0)
- 		ret = btrfs_orphan_add(trans, inode);
- 
- 	nr = trans->blocks_used;
- 
- 	btrfs_end_transaction_throttle(trans, root);
- 	btrfs_unreserve_metadata_space(root, 6);
- 	btrfs_btree_balance_dirty(root, nr);
- 	return ret;
+ 	return 0;
  }
  
- int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- 			struct btrfs_root *root,
- 			struct inode *dir, u64 objectid,
- 			const char *name, int name_len)
+ /*
+  * helper to start transaction for unlink and rmdir.
+  *
+  * unlink and rmdir are special in btrfs, they do not always free space.
+  * so in enospc case, we should make sure they will free space before
+  * allowing them to use the global metadata reservation.
+  */
+ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+ 						       struct dentry *dentry)
  {
+ 	struct btrfs_trans_handle *trans;
+ 	struct btrfs_root *root = BTRFS_I(dir)->root;
  	struct btrfs_path *path;
- 	struct extent_buffer *leaf;
+ 	struct btrfs_inode_ref *ref;
  	struct btrfs_dir_item *di;
- 	struct btrfs_key key;
+ 	struct inode *inode = dentry->d_inode;
  	u64 index;
+ 	int check_link = 1;
+ 	int err = -ENOSPC;
  	int ret;
  
- 	path = btrfs_alloc_path();
- 	if (!path)
- 		return -ENOMEM;
+ 	trans = btrfs_start_transaction(root, 10);
+ 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ 		return trans;
  
- 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
- 				   name, name_len, -1);
- 	BUG_ON(!di || IS_ERR(di));
+ 	if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ 		return ERR_PTR(-ENOSPC);
  
- 	leaf = path->nodes[0];
- 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
- 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
- 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
- 	BUG_ON(ret);
- 	btrfs_release_path(root, path);
+ 	/* check if someone else holds a reference */
+ 	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+ 		return ERR_PTR(-ENOSPC);
  
- 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
- 				 objectid, root->root_key.objectid,
- 				 dir->i_ino, &index, name, name_len);
- 	if (ret < 0) {
- 		BUG_ON(ret != -ENOENT);
- 		di = btrfs_search_dir_index_item(root, path, dir->i_ino,
- 						 name, name_len);
- 		BUG_ON(!di || IS_ERR(di));
+ 	if (atomic_read(&inode->i_count) > 2)
+ 		return ERR_PTR(-ENOSPC);
  
- 		leaf = path->nodes[0];
- 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- 		btrfs_release_path(root, path);
- 		index = key.offset;
+ 	if (xchg(&root->fs_info->enospc_unlink, 1))
+ 		return ERR_PTR(-ENOSPC);
+ 
+ 	path = btrfs_alloc_path();
+ 	if (!path) {
+ 		root->fs_info->enospc_unlink = 0;
+ 		return ERR_PTR(-ENOMEM);
  	}
  
- 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- 					 index, name, name_len, -1);
- 	BUG_ON(!di || IS_ERR(di));
+ 	trans = btrfs_start_transaction(root, 0);
+ 	if (IS_ERR(trans)) {
+ 		btrfs_free_path(path);
+ 		root->fs_info->enospc_unlink = 0;
+ 		return trans;
+ 	}
  
- 	leaf = path->nodes[0];
- 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
- 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
- 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
- 	BUG_ON(ret);
- 	btrfs_release_path(root, path);
+ 	path->skip_locking = 1;
+ 	path->search_commit_root = 1;
  
- 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
- 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ 	ret = btrfs_lookup_inode(trans, root, path,
+ 				&BTRFS_I(dir)->location, 0);
+ 	if (ret < 0) {
+ 		err = ret;
+ 		goto out;
+ 	}
+ 	if (ret == 0) {
+ 		if (check_path_shared(root, path))
+ 			goto out;
+ 	} else {
+ 		check_link = 0;
+ 	}
+ 	btrfs_release_path(root, path);
+ 
+ 	ret = btrfs_lookup_inode(trans, root, path,
+ 				&BTRFS_I(inode)->location, 0);
+ 	if (ret < 0) {
+ 		err = ret;
+ 		goto out;
+ 	}
+ 	if (ret == 0) {
+ 		if (check_path_shared(root, path))
+ 			goto out;
+ 	} else {
+ 		check_link = 0;
+ 	}
+ 	btrfs_release_path(root, path);
+ 
+ 	if (ret == 0 && S_ISREG(inode->i_mode)) {
+ 		ret = btrfs_lookup_file_extent(trans, root, path,
+ 					       inode->i_ino, (u64)-1, 0);
+ 		if (ret < 0) {
+ 			err = ret;
+ 			goto out;
+ 		}
+ 		BUG_ON(ret == 0);
+ 		if (check_path_shared(root, path))
+ 			goto out;
+ 		btrfs_release_path(root, path);
+ 	}
+ 
+ 	if (!check_link) {
+ 		err = 0;
+ 		goto out;
+ 	}
+ 
+ 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ 				dentry->d_name.name, dentry->d_name.len, 0);
+ 	if (IS_ERR(di)) {
+ 		err = PTR_ERR(di);
+ 		goto out;
+ 	}
+ 	if (di) {
+ 		if (check_path_shared(root, path))
+ 			goto out;
+ 	} else {
+ 		err = 0;
+ 		goto out;
+ 	}
+ 	btrfs_release_path(root, path);
+ 
+ 	ref = btrfs_lookup_inode_ref(trans, root, path,
+ 				dentry->d_name.name, dentry->d_name.len,
+ 				inode->i_ino, dir->i_ino, 0);
+ 	if (IS_ERR(ref)) {
+ 		err = PTR_ERR(ref);
+ 		goto out;
+ 	}
+ 	BUG_ON(!ref);
+ 	if (check_path_shared(root, path))
+ 		goto out;
+ 	index = btrfs_inode_ref_index(path->nodes[0], ref);
+ 	btrfs_release_path(root, path);
+ 
+ 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+ 				dentry->d_name.name, dentry->d_name.len, 0);
+ 	if (IS_ERR(di)) {
+ 		err = PTR_ERR(di);
+ 		goto out;
+ 	}
+ 	BUG_ON(ret == -ENOENT);
+ 	if (check_path_shared(root, path))
+ 		goto out;
+ 
+ 	err = 0;
+ out:
+ 	btrfs_free_path(path);
+ 	if (err) {
+ 		btrfs_end_transaction(trans, root);
+ 		root->fs_info->enospc_unlink = 0;
+ 		return ERR_PTR(err);
+ 	}
+ 
+ 	trans->block_rsv = &root->fs_info->global_block_rsv;
+ 	return trans;
+ }
+ 
+ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+ 			       struct btrfs_root *root)
+ {
+ 	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ 		BUG_ON(!root->fs_info->enospc_unlink);
+ 		root->fs_info->enospc_unlink = 0;
+ 	}
+ 	btrfs_end_transaction_throttle(trans, root);
+ }
+ 
+ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+ {
+ 	struct btrfs_root *root = BTRFS_I(dir)->root;
+ 	struct btrfs_trans_handle *trans;
+ 	struct inode *inode = dentry->d_inode;
+ 	int ret;
+ 	unsigned long nr = 0;
+ 
+ 	trans = __unlink_start_trans(dir, dentry);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
+ 
+ 	btrfs_set_trans_block_group(trans, dir);
+ 
+ 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+ 
+ 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ 				 dentry->d_name.name, dentry->d_name.len);
+ 	BUG_ON(ret);
+ 
+ 	if (inode->i_nlink == 0) {
+ 		ret = btrfs_orphan_add(trans, inode);
+ 		BUG_ON(ret);
+ 	}
+ 
+ 	nr = trans->blocks_used;
+ 	__unlink_end_trans(trans, root);
+ 	btrfs_btree_balance_dirty(root, nr);
+ 	return ret;
+ }
+ 
+ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ 			struct btrfs_root *root,
+ 			struct inode *dir, u64 objectid,
+ 			const char *name, int name_len)
+ {
+ 	struct btrfs_path *path;
+ 	struct extent_buffer *leaf;
+ 	struct btrfs_dir_item *di;
+ 	struct btrfs_key key;
+ 	u64 index;
+ 	int ret;
+ 
+ 	path = btrfs_alloc_path();
+ 	if (!path)
+ 		return -ENOMEM;
+ 
+ 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ 				   name, name_len, -1);
+ 	BUG_ON(!di || IS_ERR(di));
+ 
+ 	leaf = path->nodes[0];
+ 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ 	BUG_ON(ret);
+ 	btrfs_release_path(root, path);
+ 
+ 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ 				 objectid, root->root_key.objectid,
+ 				 dir->i_ino, &index, name, name_len);
+ 	if (ret < 0) {
+ 		BUG_ON(ret != -ENOENT);
+ 		di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ 						 name, name_len);
+ 		BUG_ON(!di || IS_ERR(di));
+ 
+ 		leaf = path->nodes[0];
+ 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ 		btrfs_release_path(root, path);
+ 		index = key.offset;
+ 	}
+ 
+ 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ 					 index, name, name_len, -1);
+ 	BUG_ON(!di || IS_ERR(di));
+ 
+ 	leaf = path->nodes[0];
+ 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ 	BUG_ON(ret);
+ 	btrfs_release_path(root, path);
+ 
+ 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
  	ret = btrfs_update_inode(trans, root, dir);
  	BUG_ON(ret);
  	dir->i_sb->s_dirt = 1;
@@@ -2587,7 -2948,6 +2948,6 @@@ static int btrfs_rmdir(struct inode *di
  {
  	struct inode *inode = dentry->d_inode;
  	int err = 0;
- 	int ret;
  	struct btrfs_root *root = BTRFS_I(dir)->root;
  	struct btrfs_trans_handle *trans;
  	unsigned long nr = 0;
@@@ -2596,15 -2956,9 +2956,9 @@@
  	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
  		return -ENOTEMPTY;
  
- 	ret = btrfs_reserve_metadata_space(root, 5);
- 	if (ret)
- 		return ret;
- 
- 	trans = btrfs_start_transaction(root, 1);
- 	if (IS_ERR(trans)) {
- 		btrfs_unreserve_metadata_space(root, 5);
+ 	trans = __unlink_start_trans(dir, dentry);
+ 	if (IS_ERR(trans))
  		return PTR_ERR(trans);
- 	}
  
  	btrfs_set_trans_block_group(trans, dir);
  
@@@ -2627,12 -2981,9 +2981,9 @@@
  		btrfs_i_size_write(inode, 0);
  out:
  	nr = trans->blocks_used;
- 	ret = btrfs_end_transaction_throttle(trans, root);
- 	btrfs_unreserve_metadata_space(root, 5);
+ 	__unlink_end_trans(trans, root);
  	btrfs_btree_balance_dirty(root, nr);
  
- 	if (ret && !err)
- 		err = ret;
  	return err;
  }
  
@@@ -3029,6 -3380,7 +3380,7 @@@ out
  	if (pending_del_nr) {
  		ret = btrfs_del_items(trans, root, path, pending_del_slot,
  				      pending_del_nr);
+ 		BUG_ON(ret);
  	}
  	btrfs_free_path(path);
  	return err;
@@@ -3056,11 -3408,7 +3408,7 @@@ static int btrfs_truncate_page(struct a
  
  	if ((offset & (blocksize - 1)) == 0)
  		goto out;
- 	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- 	if (ret)
- 		goto out;
- 
- 	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
  	if (ret)
  		goto out;
  
@@@ -3068,8 -3416,7 +3416,7 @@@
  again:
  	page = grab_cache_page(mapping, index);
  	if (!page) {
- 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- 		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
  		goto out;
  	}
  
@@@ -3132,8 -3479,7 +3479,7 @@@
  
  out_unlock:
  	if (ret)
- 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- 	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
  	unlock_page(page);
  	page_cache_release(page);
  out:
@@@ -3145,7 -3491,7 +3491,7 @@@ int btrfs_cont_expand(struct inode *ino
  	struct btrfs_trans_handle *trans;
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- 	struct extent_map *em;
+ 	struct extent_map *em = NULL;
  	struct extent_state *cached_state = NULL;
  	u64 mask = root->sectorsize - 1;
  	u64 hole_start = (inode->i_size + mask) & ~mask;
@@@ -3183,11 -3529,11 +3529,11 @@@
  			u64 hint_byte = 0;
  			hole_size = last_byte - cur_offset;
  
- 			err = btrfs_reserve_metadata_space(root, 2);
- 			if (err)
+ 			trans = btrfs_start_transaction(root, 2);
+ 			if (IS_ERR(trans)) {
+ 				err = PTR_ERR(trans);
  				break;
- 
- 			trans = btrfs_start_transaction(root, 1);
+ 			}
  			btrfs_set_trans_block_group(trans, inode);
  
  			err = btrfs_drop_extents(trans, inode, cur_offset,
@@@ -3205,14 -3551,15 +3551,15 @@@
  					last_byte - 1, 0);
  
  			btrfs_end_transaction(trans, root);
- 			btrfs_unreserve_metadata_space(root, 2);
  		}
  		free_extent_map(em);
+ 		em = NULL;
  		cur_offset = last_byte;
  		if (cur_offset >= block_end)
  			break;
  	}
  
+ 	free_extent_map(em);
  	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
  			     GFP_NOFS);
  	return err;
@@@ -3239,11 -3586,10 +3586,10 @@@ static int btrfs_setattr_size(struct in
  		}
  	}
  
- 	ret = btrfs_reserve_metadata_space(root, 1);
- 	if (ret)
- 		return ret;
+ 	trans = btrfs_start_transaction(root, 5);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
  	btrfs_set_trans_block_group(trans, inode);
  
  	ret = btrfs_orphan_add(trans, inode);
@@@ -3251,7 -3597,6 +3597,6 @@@
  
  	nr = trans->blocks_used;
  	btrfs_end_transaction(trans, root);
- 	btrfs_unreserve_metadata_space(root, 1);
  	btrfs_btree_balance_dirty(root, nr);
  
  	if (attr->ia_size > inode->i_size) {
@@@ -3264,8 -3609,11 +3609,11 @@@
  		i_size_write(inode, attr->ia_size);
  		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
- 		trans = btrfs_start_transaction(root, 1);
+ 		trans = btrfs_start_transaction(root, 0);
+ 		BUG_ON(IS_ERR(trans));
  		btrfs_set_trans_block_group(trans, inode);
+ 		trans->block_rsv = root->orphan_block_rsv;
+ 		BUG_ON(!trans->block_rsv);
  
  		ret = btrfs_update_inode(trans, root, inode);
  		BUG_ON(ret);
@@@ -3345,10 -3693,21 +3693,21 @@@ void btrfs_delete_inode(struct inode *i
  	btrfs_i_size_write(inode, 0);
  
  	while (1) {
- 		trans = btrfs_start_transaction(root, 1);
+ 		trans = btrfs_start_transaction(root, 0);
+ 		BUG_ON(IS_ERR(trans));
  		btrfs_set_trans_block_group(trans, inode);
- 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ 		trans->block_rsv = root->orphan_block_rsv;
+ 
+ 		ret = btrfs_block_rsv_check(trans, root,
+ 					    root->orphan_block_rsv, 0, 5);
+ 		if (ret) {
+ 			BUG_ON(ret != -EAGAIN);
+ 			ret = btrfs_commit_transaction(trans, root);
+ 			BUG_ON(ret);
+ 			continue;
+ 		}
  
+ 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
  		if (ret != -EAGAIN)
  			break;
  
@@@ -3356,6 -3715,7 +3715,7 @@@
  		btrfs_end_transaction(trans, root);
  		trans = NULL;
  		btrfs_btree_balance_dirty(root, nr);
+ 
  	}
  
  	if (ret == 0) {
@@@ -3596,40 -3956,10 +3956,10 @@@ again
  	return 0;
  }
  
- static noinline void init_btrfs_i(struct inode *inode)
- {
- 	struct btrfs_inode *bi = BTRFS_I(inode);
- 
- 	bi->generation = 0;
- 	bi->sequence = 0;
- 	bi->last_trans = 0;
- 	bi->last_sub_trans = 0;
- 	bi->logged_trans = 0;
- 	bi->delalloc_bytes = 0;
- 	bi->reserved_bytes = 0;
- 	bi->disk_i_size = 0;
- 	bi->flags = 0;
- 	bi->index_cnt = (u64)-1;
- 	bi->last_unlink_trans = 0;
- 	bi->ordered_data_close = 0;
- 	bi->force_compress = 0;
- 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
- 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
- 			     inode->i_mapping, GFP_NOFS);
- 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
- 			     inode->i_mapping, GFP_NOFS);
- 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
- 	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
- 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
- 	mutex_init(&BTRFS_I(inode)->log_mutex);
- }
- 
  static int btrfs_init_locked_inode(struct inode *inode, void *p)
  {
  	struct btrfs_iget_args *args = p;
  	inode->i_ino = args->ino;
- 	init_btrfs_i(inode);
  	BTRFS_I(inode)->root = args->root;
  	btrfs_set_inode_space_info(args->root, inode);
  	return 0;
@@@ -3692,8 -4022,6 +4022,6 @@@ static struct inode *new_simple_dir(str
  	if (!inode)
  		return ERR_PTR(-ENOMEM);
  
- 	init_btrfs_i(inode);
- 
  	BTRFS_I(inode)->root = root;
  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
  	BTRFS_I(inode)->dummy_inode = 1;
@@@ -3950,7 -4278,7 +4278,7 @@@ int btrfs_write_inode(struct inode *ino
  	struct btrfs_trans_handle *trans;
  	int ret = 0;
  
- 	if (root->fs_info->btree_inode == inode)
+ 	if (BTRFS_I(inode)->dummy_inode)
  		return 0;
  
  	if (wbc->sync_mode == WB_SYNC_ALL) {
@@@ -3971,10 -4299,38 +4299,38 @@@ void btrfs_dirty_inode(struct inode *in
  {
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	struct btrfs_trans_handle *trans;
+ 	int ret;
+ 
+ 	if (BTRFS_I(inode)->dummy_inode)
+ 		return;
  
  	trans = btrfs_join_transaction(root, 1);
  	btrfs_set_trans_block_group(trans, inode);
- 	btrfs_update_inode(trans, root, inode);
+ 
+ 	ret = btrfs_update_inode(trans, root, inode);
+ 	if (ret && ret == -ENOSPC) {
+ 		/* whoops, let's try again with the full transaction */
+ 		btrfs_end_transaction(trans, root);
+ 		trans = btrfs_start_transaction(root, 1);
+ 		if (IS_ERR(trans)) {
+ 			if (printk_ratelimit()) {
+ 				printk(KERN_ERR "btrfs: fail to "
+ 				       "dirty  inode %lu error %ld\n",
+ 				       inode->i_ino, PTR_ERR(trans));
+ 			}
+ 			return;
+ 		}
+ 		btrfs_set_trans_block_group(trans, inode);
+ 
+ 		ret = btrfs_update_inode(trans, root, inode);
+ 		if (ret) {
+ 			if (printk_ratelimit()) {
+ 				printk(KERN_ERR "btrfs: fail to "
+ 				       "dirty  inode %lu error %d\n",
+ 				       inode->i_ino, ret);
+ 			}
+ 		}
+ 	}
  	btrfs_end_transaction(trans, root);
  }
  
@@@ -4092,7 -4448,6 +4448,6 @@@ static struct inode *btrfs_new_inode(st
  	 * btrfs_get_inode_index_count has an explanation for the magic
  	 * number
  	 */
- 	init_btrfs_i(inode);
  	BTRFS_I(inode)->index_cnt = 2;
  	BTRFS_I(inode)->root = root;
  	BTRFS_I(inode)->generation = trans->transid;
@@@ -4121,7 -4476,16 +4476,7 @@@
  	if (ret != 0)
  		goto fail;
  
 -	inode->i_uid = current_fsuid();
 -
 -	if (dir && (dir->i_mode & S_ISGID)) {
 -		inode->i_gid = dir->i_gid;
 -		if (S_ISDIR(mode))
 -			mode |= S_ISGID;
 -	} else
 -		inode->i_gid = current_fsgid();
 -
 -	inode->i_mode = mode;
 +	inode_init_owner(inode, dir, mode);
  	inode->i_ino = objectid;
  	inode_set_bytes(inode, 0);
  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@@ -4247,26 -4611,21 +4602,21 @@@ static int btrfs_mknod(struct inode *di
  	if (!new_valid_dev(rdev))
  		return -EINVAL;
  
+ 	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ 	if (err)
+ 		return err;
+ 
  	/*
  	 * 2 for inode item and ref
  	 * 2 for dir items
  	 * 1 for xattr if selinux is on
  	 */
- 	err = btrfs_reserve_metadata_space(root, 5);
- 	if (err)
- 		return err;
+ 	trans = btrfs_start_transaction(root, 5);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans)
- 		goto fail;
  	btrfs_set_trans_block_group(trans, dir);
  
- 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- 	if (err) {
- 		err = -ENOSPC;
- 		goto out_unlock;
- 	}
- 
  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
  				dentry->d_name.len,
  				dentry->d_parent->d_inode->i_ino, objectid,
@@@ -4295,13 -4654,11 +4645,11 @@@
  out_unlock:
  	nr = trans->blocks_used;
  	btrfs_end_transaction_throttle(trans, root);
- fail:
- 	btrfs_unreserve_metadata_space(root, 5);
+ 	btrfs_btree_balance_dirty(root, nr);
  	if (drop_inode) {
  		inode_dec_link_count(inode);
  		iput(inode);
  	}
- 	btrfs_btree_balance_dirty(root, nr);
  	return err;
  }
  
@@@ -4311,32 -4668,26 +4659,26 @@@ static int btrfs_create(struct inode *d
  	struct btrfs_trans_handle *trans;
  	struct btrfs_root *root = BTRFS_I(dir)->root;
  	struct inode *inode = NULL;
- 	int err;
  	int drop_inode = 0;
+ 	int err;
  	unsigned long nr = 0;
  	u64 objectid;
  	u64 index = 0;
  
+ 	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ 	if (err)
+ 		return err;
  	/*
  	 * 2 for inode item and ref
  	 * 2 for dir items
  	 * 1 for xattr if selinux is on
  	 */
- 	err = btrfs_reserve_metadata_space(root, 5);
- 	if (err)
- 		return err;
+ 	trans = btrfs_start_transaction(root, 5);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans)
- 		goto fail;
  	btrfs_set_trans_block_group(trans, dir);
  
- 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- 	if (err) {
- 		err = -ENOSPC;
- 		goto out_unlock;
- 	}
- 
  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
  				dentry->d_name.len,
  				dentry->d_parent->d_inode->i_ino,
@@@ -4368,8 -4719,6 +4710,6 @@@
  out_unlock:
  	nr = trans->blocks_used;
  	btrfs_end_transaction_throttle(trans, root);
- fail:
- 	btrfs_unreserve_metadata_space(root, 5);
  	if (drop_inode) {
  		inode_dec_link_count(inode);
  		iput(inode);
@@@ -4396,21 -4745,21 +4736,21 @@@ static int btrfs_link(struct dentry *ol
  	if (root->objectid != BTRFS_I(inode)->root->objectid)
  		return -EPERM;
  
- 	/*
- 	 * 1 item for inode ref
- 	 * 2 items for dir items
- 	 */
- 	err = btrfs_reserve_metadata_space(root, 3);
- 	if (err)
- 		return err;
- 
  	btrfs_inc_nlink(inode);
  
  	err = btrfs_set_inode_index(dir, &index);
  	if (err)
  		goto fail;
  
- 	trans = btrfs_start_transaction(root, 1);
+ 	/*
+ 	 * 1 item for inode ref
+ 	 * 2 items for dir items
+ 	 */
+ 	trans = btrfs_start_transaction(root, 3);
+ 	if (IS_ERR(trans)) {
+ 		err = PTR_ERR(trans);
+ 		goto fail;
+ 	}
  
  	btrfs_set_trans_block_group(trans, dir);
  	atomic_inc(&inode->i_count);
@@@ -4429,7 -4778,6 +4769,6 @@@
  	nr = trans->blocks_used;
  	btrfs_end_transaction_throttle(trans, root);
  fail:
- 	btrfs_unreserve_metadata_space(root, 3);
  	if (drop_inode) {
  		inode_dec_link_count(inode);
  		iput(inode);
@@@ -4449,28 -4797,20 +4788,20 @@@ static int btrfs_mkdir(struct inode *di
  	u64 index = 0;
  	unsigned long nr = 1;
  
+ 	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ 	if (err)
+ 		return err;
+ 
  	/*
  	 * 2 items for inode and ref
  	 * 2 items for dir items
  	 * 1 for xattr if selinux is on
  	 */
- 	err = btrfs_reserve_metadata_space(root, 5);
- 	if (err)
- 		return err;
- 
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans) {
- 		err = -ENOMEM;
- 		goto out_unlock;
- 	}
+ 	trans = btrfs_start_transaction(root, 5);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  	btrfs_set_trans_block_group(trans, dir);
  
- 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- 	if (err) {
- 		err = -ENOSPC;
- 		goto out_fail;
- 	}
- 
  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
  				dentry->d_name.len,
  				dentry->d_parent->d_inode->i_ino, objectid,
@@@ -4510,9 -4850,6 +4841,6 @@@
  out_fail:
  	nr = trans->blocks_used;
  	btrfs_end_transaction_throttle(trans, root);
- 
- out_unlock:
- 	btrfs_unreserve_metadata_space(root, 5);
  	if (drop_on_err)
  		iput(inode);
  	btrfs_btree_balance_dirty(root, nr);
@@@ -4770,6 -5107,7 +5098,7 @@@ again
  			}
  			flush_dcache_page(page);
  		} else if (create && PageUptodate(page)) {
+ 			WARN_ON(1);
  			if (!trans) {
  				kunmap(page);
  				free_extent_map(em);
@@@ -4866,11 -5204,651 +5195,651 @@@ out
  	return em;
  }
  
+ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ 						  u64 start, u64 len)
+ {
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_trans_handle *trans;
+ 	struct extent_map *em;
+ 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ 	struct btrfs_key ins;
+ 	u64 alloc_hint;
+ 	int ret;
+ 
+ 	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ 
+ 	trans = btrfs_join_transaction(root, 0);
+ 	if (!trans)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ 
+ 	alloc_hint = get_extent_allocation_hint(inode, start, len);
+ 	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+ 				   alloc_hint, (u64)-1, &ins, 1);
+ 	if (ret) {
+ 		em = ERR_PTR(ret);
+ 		goto out;
+ 	}
+ 
+ 	em = alloc_extent_map(GFP_NOFS);
+ 	if (!em) {
+ 		em = ERR_PTR(-ENOMEM);
+ 		goto out;
+ 	}
+ 
+ 	em->start = start;
+ 	em->orig_start = em->start;
+ 	em->len = ins.offset;
+ 
+ 	em->block_start = ins.objectid;
+ 	em->block_len = ins.offset;
+ 	em->bdev = root->fs_info->fs_devices->latest_bdev;
+ 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ 
+ 	while (1) {
+ 		write_lock(&em_tree->lock);
+ 		ret = add_extent_mapping(em_tree, em);
+ 		write_unlock(&em_tree->lock);
+ 		if (ret != -EEXIST)
+ 			break;
+ 		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+ 	}
+ 
+ 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+ 					   ins.offset, ins.offset, 0);
+ 	if (ret) {
+ 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ 		em = ERR_PTR(ret);
+ 	}
+ out:
+ 	btrfs_end_transaction(trans, root);
+ 	return em;
+ }
+ 
+ /*
+  * returns 1 when the nocow is safe, < 0 on error, 0 if the
+  * block must be cow'd
+  */
+ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+ 				      struct inode *inode, u64 offset, u64 len)
+ {
+ 	struct btrfs_path *path;
+ 	int ret;
+ 	struct extent_buffer *leaf;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_file_extent_item *fi;
+ 	struct btrfs_key key;
+ 	u64 disk_bytenr;
+ 	u64 backref_offset;
+ 	u64 extent_end;
+ 	u64 num_bytes;
+ 	int slot;
+ 	int found_type;
+ 
+ 	path = btrfs_alloc_path();
+ 	if (!path)
+ 		return -ENOMEM;
+ 
+ 	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ 				       offset, 0);
+ 	if (ret < 0)
+ 		goto out;
+ 
+ 	slot = path->slots[0];
+ 	if (ret == 1) { /* no exact match: fall back to the previous item, if any */
+ 		if (slot == 0) {
+ 			/* can't find the item, must cow */
+ 			ret = 0;
+ 			goto out;
+ 		}
+ 		slot--;
+ 	}
+ 	ret = 0; /* every "must cow" exit below returns 0 */
+ 	leaf = path->nodes[0];
+ 	btrfs_item_key_to_cpu(leaf, &key, slot);
+ 	if (key.objectid != inode->i_ino ||
+ 	    key.type != BTRFS_EXTENT_DATA_KEY) {
+ 		/* not our file or wrong item type, must cow */
+ 		goto out;
+ 	}
+ 
+ 	if (key.offset > offset) {
+ 		/* Wrong offset, must cow */
+ 		goto out;
+ 	}
+ 
+ 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ 	found_type = btrfs_file_extent_type(leaf, fi);
+ 	if (found_type != BTRFS_FILE_EXTENT_REG &&
+ 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ 		/* not a regular extent, must cow */
+ 		goto out;
+ 	}
+ 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ 	backref_offset = btrfs_file_extent_offset(leaf, fi);
+ 
+ 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ 	if (extent_end < offset + len) {
+ 		/* extent doesn't include our full range, must cow */
+ 		goto out;
+ 	}
+ 
+ 	if (btrfs_extent_readonly(root, disk_bytenr))
+ 		goto out;
+ 
+ 	/*
+ 	 * look for other files referencing this extent, if we
+ 	 * find any we must cow
+ 	 */
+ 	if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ 				  key.offset - backref_offset, disk_bytenr))
+ 		goto out;
+ 
+ 	/*
+ 	 * adjust disk_bytenr and num_bytes to cover just the bytes
+ 	 * in this extent we are about to write.  If there
+ 	 * are any csums in that range we have to cow in order
+ 	 * to keep the csums correct
+ 	 */
+ 	disk_bytenr += backref_offset;
+ 	disk_bytenr += offset - key.offset;
+ 	num_bytes = min(offset + len, extent_end) - offset;
+ 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ 				goto out;
+ 	/*
+ 	 * all of the above have passed, it is safe to overwrite this extent
+ 	 * without cow
+ 	 */
+ 	ret = 1;
+ out:
+ 	btrfs_free_path(path);
+ 	return ret;
+ }
+ /* get_block callback handed to __blockdev_direct_IO: maps iblock for DIO, reusing or allocating an extent when create is set. */
+ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ 				   struct buffer_head *bh_result, int create)
+ {
+ 	struct extent_map *em;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	u64 start = iblock << inode->i_blkbits;
+ 	u64 len = bh_result->b_size;
+ 	struct btrfs_trans_handle *trans;
+ 
+ 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ 	if (IS_ERR(em))
+ 		return PTR_ERR(em);
+ 
+ 	/*
+ 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ 	 * io.  INLINE is special, and we could probably kludge it in here, but
+ 	 * it's still buffered so for safety let's just fall back to the generic
+ 	 * buffered path.
+ 	 *
+ 	 * For COMPRESSED we _have_ to read the entire extent in so we can
+ 	 * decompress it, so there will be buffering required no matter what we
+ 	 * do, so go ahead and fallback to buffered.
+ 	 *
+ 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
+ 	 * to buffered IO.  Don't blame me, this is the price we pay for using
+ 	 * the generic code.
+ 	 */
+ 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ 	    em->block_start == EXTENT_MAP_INLINE) {
+ 		free_extent_map(em);
+ 		return -ENOTBLK;
+ 	}
+ 
+ 	/* Just a good old fashioned hole, return */
+ 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+ 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ 		free_extent_map(em);
+ 		/* DIO will do one hole at a time, so just unlock a sector */
+ 		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+ 			      start + root->sectorsize - 1, GFP_NOFS);
+ 		return 0; /* bh_result is left unmapped on this path */
+ 	}
+ 
+ 	/*
+ 	 * We don't allocate a new extent in the following cases
+ 	 *
+ 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+ 	 * existing extent.
+ 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+ 	 * just use the extent.
+ 	 *
+ 	 */
+ 	if (!create) { /* read of a mapped extent: map up to the end of the extent map */
+ 		len = em->len - (start - em->start);
+ 		goto map;
+ 	}
+ 
+ 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ 	     em->block_start != EXTENT_MAP_HOLE)) {
+ 		int type;
+ 		int ret;
+ 		u64 block_start;
+ 
+ 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ 			type = BTRFS_ORDERED_PREALLOC;
+ 		else
+ 			type = BTRFS_ORDERED_NOCOW;
+ 		len = min(len, em->len - (start - em->start));
+ 		block_start = em->block_start + (start - em->start);
+ 
+ 		/*
+ 		 * we're not going to log anything, but we do need
+ 		 * to make sure the current transaction stays open
+ 		 * while we look for nocow cross refs
+ 		 */
+ 		trans = btrfs_join_transaction(root, 0);
+ 		if (!trans)
+ 			goto must_cow;
+ 
+ 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ 			ret = btrfs_add_ordered_extent_dio(inode, start,
+ 					   block_start, len, len, type);
+ 			btrfs_end_transaction(trans, root);
+ 			if (ret) {
+ 				free_extent_map(em);
+ 				return ret;
+ 			}
+ 			goto unlock;
+ 		}
+ 		btrfs_end_transaction(trans, root); /* nocow refused (0) or lookup failed (< 0): fall through and cow */
+ 	}
+ must_cow:
+ 	/*
+ 	 * this will cow the extent, reset the len in case we changed
+ 	 * it above
+ 	 */
+ 	len = bh_result->b_size;
+ 	free_extent_map(em);
+ 	em = btrfs_new_extent_direct(inode, start, len);
+ 	if (IS_ERR(em))
+ 		return PTR_ERR(em);
+ 	len = min(len, em->len - (start - em->start));
+ unlock:
+ 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ 			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+ 			  0, NULL, GFP_NOFS);
+ map:
+ 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ 		inode->i_blkbits;
+ 	bh_result->b_size = len;
+ 	bh_result->b_bdev = em->bdev;
+ 	set_buffer_mapped(bh_result);
+ 	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ 		set_buffer_new(bh_result);
+ 
+ 	free_extent_map(em);
+ 
+ 	return 0;
+ }
+ /* Per-bio direct IO state; kept in bio->bi_private from btrfs_submit_direct until an endio handler frees it. */
+ struct btrfs_dio_private {
+ 	struct inode *inode; /* inode the IO is against */
+ 	u64 logical_offset; /* file offset of the bio (file_offset at submit) */
+ 	u64 disk_bytenr; /* bio->bi_sector << 9 at submit time */
+ 	u64 bytes; /* sum of bv_len over the bio's vectors */
+ 	u32 *csums; /* one u32 per bio_vec; NULL for NODATASUM inodes */
+ 	void *private; /* original bio->bi_private, restored on completion */
+ };
+ /* DIO read completion: check each bio_vec against dip->csums, unlock the file range, free dip and end the dio. */
+ static void btrfs_endio_direct_read(struct bio *bio, int err)
+ {
+ 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+ 	struct bio_vec *bvec = bio->bi_io_vec;
+ 	struct btrfs_dio_private *dip = bio->bi_private;
+ 	struct inode *inode = dip->inode;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	u64 start;
+ 	u32 *private = dip->csums; /* walks the csum array in step with bvec */
+ 
+ 	start = dip->logical_offset;
+ 	do {
+ 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ 			struct page *page = bvec->bv_page;
+ 			char *kaddr;
+ 			u32 csum = ~(u32)0;
+ 			unsigned long flags;
+ 
+ 			local_irq_save(flags);
+ 			kaddr = kmap_atomic(page, KM_IRQ0);
+ 			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ 					       csum, bvec->bv_len);
+ 			btrfs_csum_final(csum, (char *)&csum);
+ 			kunmap_atomic(kaddr, KM_IRQ0);
+ 			local_irq_restore(flags);
+ 
+ 			flush_dcache_page(bvec->bv_page);
+ 			if (csum != *private) {
+ 				printk(KERN_ERR "btrfs csum failed ino %lu off"
+ 				      " %llu csum %u private %u\n",
+ 				      inode->i_ino, (unsigned long long)start,
+ 				      csum, *private);
+ 				err = -EIO; /* keep scanning; the whole bio is failed below */
+ 			}
+ 		}
+ 
+ 		start += bvec->bv_len;
+ 		private++;
+ 		bvec++;
+ 	} while (bvec <= bvec_end);
+ 
+ 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+ 		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ 	bio->bi_private = dip->private; /* hand the bio back with its original private data */
+ 
+ 	kfree(dip->csums);
+ 	kfree(dip);
+ 	dio_end_io(bio, err);
+ }
+ /* DIO write completion: when the ordered extent is done, record extent, csums and i_size, then free dip and end the dio. */
+ static void btrfs_endio_direct_write(struct bio *bio, int err)
+ {
+ 	struct btrfs_dio_private *dip = bio->bi_private;
+ 	struct inode *inode = dip->inode;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_trans_handle *trans;
+ 	struct btrfs_ordered_extent *ordered = NULL;
+ 	struct extent_state *cached_state = NULL;
+ 	int ret;
+ 
+ 	if (err)
+ 		goto out_done;
+ 
+ 	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+ 					     dip->logical_offset, dip->bytes);
+ 	if (!ret) /* presumably other IO is still pending on this ordered extent -- confirm */
+ 		goto out_done;
+ 
+ 	BUG_ON(!ordered);
+ 
+ 	trans = btrfs_join_transaction(root, 1);
+ 	if (!trans) {
+ 		err = -ENOMEM;
+ 		goto out; /* NOTE(review): out: passes this NULL trans to btrfs_end_transaction() */
+ 	}
+ 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ 
+ 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+ 		if (!ret)
+ 			ret = btrfs_update_inode(trans, root, inode);
+ 		err = ret;
+ 		goto out; /* nocow: only i_size and the inode item are updated */
+ 	}
+ 
+ 	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ 			 ordered->file_offset + ordered->len - 1, 0,
+ 			 &cached_state, GFP_NOFS);
+ 
+ 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ 		ret = btrfs_mark_extent_written(trans, inode,
+ 						ordered->file_offset,
+ 						ordered->file_offset +
+ 						ordered->len);
+ 		if (ret) {
+ 			err = ret;
+ 			goto out_unlock;
+ 		}
+ 	} else {
+ 		ret = insert_reserved_file_extent(trans, inode,
+ 						  ordered->file_offset,
+ 						  ordered->start,
+ 						  ordered->disk_len,
+ 						  ordered->len,
+ 						  ordered->len,
+ 						  0, 0, 0,
+ 						  BTRFS_FILE_EXTENT_REG);
+ 		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ 				   ordered->file_offset, ordered->len);
+ 		if (ret) {
+ 			err = ret;
+ 			WARN_ON(1);
+ 			goto out_unlock;
+ 		}
+ 	}
+ 
+ 	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+ 	btrfs_ordered_update_i_size(inode, 0, ordered); /* return values of these two calls are ignored here */
+ 	btrfs_update_inode(trans, root, inode);
+ out_unlock:
+ 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ 			     ordered->file_offset + ordered->len - 1,
+ 			     &cached_state, GFP_NOFS);
+ out:
+ 	btrfs_delalloc_release_metadata(inode, ordered->len);
+ 	btrfs_end_transaction(trans, root);
+ 	btrfs_put_ordered_extent(ordered);
+ 	btrfs_put_ordered_extent(ordered); /* NOTE(review): second put presumably drops the ordered tree's own reference -- confirm */
+ out_done:
+ 	bio->bi_private = dip->private;
+ 
+ 	kfree(dip->csums);
+ 	kfree(dip);
+ 	dio_end_io(bio, err);
+ }
+ /* Start hook given to btrfs_wq_submit_bio for DIO writes: checksums the bio's data via btrfs_csum_one_bio. */
+ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+ 				    struct bio *bio, int mirror_num,
+ 				    unsigned long bio_flags, u64 offset)
+ {
+ 	int ret;
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); /* rw, mirror_num and bio_flags are unused here */
+ 	BUG_ON(ret);
+ 	return 0;
+ }
+ /* DIO submit hook given to __blockdev_direct_IO: wraps the bio in a btrfs_dio_private, installs the btrfs endio handler, then checksums and maps it. */
+ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+ 				loff_t file_offset)
+ {
+ 	struct btrfs_root *root = BTRFS_I(inode)->root;
+ 	struct btrfs_dio_private *dip;
+ 	struct bio_vec *bvec = bio->bi_io_vec;
+ 	u64 start;
+ 	int skip_sum;
+ 	int write = rw & (1 << BIO_RW);
+ 	int ret = 0;
+ 
+ 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ 
+ 	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+ 	if (!dip) {
+ 		ret = -ENOMEM;
+ 		goto free_ordered;
+ 	}
+ 	dip->csums = NULL;
+ 
+ 	if (!skip_sum) {
+ 		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+ 		if (!dip->csums) {
+ 			ret = -ENOMEM;
+ 			goto out_err; /* dip is allocated by now and must be freed as well */
+ 		}
+ 	}
+ 
+ 	dip->private = bio->bi_private;
+ 	dip->inode = inode;
+ 	dip->logical_offset = file_offset;
+ 
+ 	start = dip->logical_offset;
+ 	dip->bytes = 0;
+ 	do {
+ 		dip->bytes += bvec->bv_len;
+ 		bvec++;
+ 	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+ 
+ 	dip->disk_bytenr = (u64)bio->bi_sector << 9;
+ 	bio->bi_private = dip;
+ 
+ 	if (write)
+ 		bio->bi_end_io = btrfs_endio_direct_write;
+ 	else
+ 		bio->bi_end_io = btrfs_endio_direct_read;
+ 
+ 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ 	if (ret)
+ 		goto out_err;
+ 
+ 	if (write && !skip_sum) { /* checksummed writes go through the async submit helpers */
+ 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ 				   inode, rw, bio, 0, 0,
+ 				   dip->logical_offset,
+ 				   __btrfs_submit_bio_start_direct_io,
+ 				   __btrfs_submit_bio_done);
+ 		if (ret)
+ 			goto out_err;
+ 		return;
+ 	} else if (!skip_sum)
+ 		btrfs_lookup_bio_sums_dio(root, inode, bio,
+ 					  dip->logical_offset, dip->csums);
+ 
+ 	ret = btrfs_map_bio(root, rw, bio, 0, 1);
+ 	if (ret)
+ 		goto out_err;
+ 	return;
+ out_err:
+ 	kfree(dip->csums); /* NOTE(review): bio->bi_private may still point at dip when bio_endio() runs below -- confirm */
+ 	kfree(dip);
+ free_ordered:
+ 	/*
+ 	 * If this is a write, we need to clean up the reserved space and kill
+ 	 * the ordered extent.
+ 	 */
+ 	if (write) {
+ 		struct btrfs_ordered_extent *ordered;
+ 		ordered = btrfs_lookup_ordered_extent(inode,
+ 				file_offset); /* dip is NULL or already freed here */
+ 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+ 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ 			btrfs_free_reserved_extent(root, ordered->start,
+ 						   ordered->disk_len);
+ 		btrfs_put_ordered_extent(ordered);
+ 		btrfs_put_ordered_extent(ordered);
+ 	}
+ 	bio_endio(bio, ret);
+ }
+ /* Returns 0 when offset and every iovec base and length are sector aligned, -EINVAL otherwise. */
+ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+ 			const struct iovec *iov, loff_t offset,
+ 			unsigned long nr_segs)
+ {
+ 	int seg;
+ 	size_t size;
+ 	unsigned long addr;
+ 	unsigned blocksize_mask = root->sectorsize - 1;
+ 	ssize_t retval = -EINVAL;
+ 	loff_t end = offset; /* accumulated below but never read in this function */
+ 
+ 	if (offset & blocksize_mask)
+ 		goto out;
+ 
+ 	/* Check the memory alignment.  Blocks cannot straddle pages */
+ 	for (seg = 0; seg < nr_segs; seg++) {
+ 		addr = (unsigned long)iov[seg].iov_base;
+ 		size = iov[seg].iov_len;
+ 		end += size;
+ 		if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+ 			goto out;
+ 	}
+ 	retval = 0;
+ out:
+ 	return retval;
+ }
  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
  			const struct iovec *iov, loff_t offset,
  			unsigned long nr_segs)
  {
- 	return -EINVAL;
+ 	struct file *file = iocb->ki_filp;
+ 	struct inode *inode = file->f_mapping->host;
+ 	struct btrfs_ordered_extent *ordered;
+ 	struct extent_state *cached_state = NULL;
+ 	u64 lockstart, lockend;
+ 	ssize_t ret;
+ 	int writing = rw & WRITE;
+ 	int write_bits = 0;
+ 	size_t count = iov_length(iov, nr_segs);
+ 
+ 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+ 			    offset, nr_segs)) {
+ 		return 0; /* unaligned request: report 0 bytes done (presumably so the caller falls back to buffered IO) */
+ 	}
+ 
+ 	lockstart = offset;
+ 	lockend = offset + count - 1;
+ 
+ 	if (writing) {
+ 		ret = btrfs_delalloc_reserve_space(inode, count);
+ 		if (ret)
+ 			goto out;
+ 	}
+ 
+ 	while (1) {
+ 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 				 0, &cached_state, GFP_NOFS);
+ 		/*
+ 		 * We're concerned with the entire range that we're going to be
+ 		 * doing DIO to, so we need to make sure there are no ordered
+ 		 * extents in this range.
+ 		 */
+ 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ 						     lockend - lockstart + 1);
+ 		if (!ordered)
+ 			break; /* leave the loop with the range still locked */
+ 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 				     &cached_state, GFP_NOFS);
+ 		btrfs_start_ordered_extent(inode, ordered, 1);
+ 		btrfs_put_ordered_extent(ordered);
+ 		cond_resched();
+ 	}
+ 
+ 	/*
+ 	 * we don't use btrfs_set_extent_delalloc because we don't want
+ 	 * the dirty or uptodate bits
+ 	 */
+ 	if (writing) {
+ 		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+ 		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 				     EXTENT_DELALLOC, 0, NULL, &cached_state,
+ 				     GFP_NOFS);
+ 		if (ret) {
+ 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ 					 lockend, EXTENT_LOCKED | write_bits,
+ 					 1, 0, &cached_state, GFP_NOFS);
+ 			goto out; /* NOTE(review): no btrfs_delalloc_release_space() call is visible for the reservation made above */
+ 		}
+ 	}
+ 
+ 	free_extent_state(cached_state);
+ 	cached_state = NULL;
+ 
+ 	ret = __blockdev_direct_IO(rw, iocb, inode,
+ 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ 		   btrfs_submit_direct, 0);
+ 
+ 	if (ret < 0 && ret != -EIOCBQUEUED) { /* hard failure: clear lock and write bits on the whole range */
+ 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+ 			      offset + iov_length(iov, nr_segs) - 1,
+ 			      EXTENT_LOCKED | write_bits, 1, 0,
+ 			      &cached_state, GFP_NOFS);
+ 	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+ 		/*
+ 		 * We're falling back to buffered, unlock the section we didn't
+ 		 * do IO on.
+ 		 */
+ 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+ 			      offset + iov_length(iov, nr_segs) - 1,
+ 			      EXTENT_LOCKED | write_bits, 1, 0,
+ 			      &cached_state, GFP_NOFS);
+ 	}
+ out:
+ 	free_extent_state(cached_state);
+ 	return ret;
  }
  
  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@@ -5034,7 -6012,7 +6003,7 @@@ int btrfs_page_mkwrite(struct vm_area_s
  	u64 page_start;
  	u64 page_end;
  
- 	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
  	if (ret) {
  		if (ret == -ENOMEM)
  			ret = VM_FAULT_OOM;
@@@ -5043,13 -6021,6 +6012,6 @@@
  		goto out;
  	}
  
- 	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- 	if (ret) {
- 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- 		ret = VM_FAULT_SIGBUS;
- 		goto out;
- 	}
- 
  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
  again:
  	lock_page(page);
@@@ -5059,7 -6030,6 +6021,6 @@@
  
  	if ((page->mapping != inode->i_mapping) ||
  	    (page_start >= size)) {
- 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
  		/* page got truncated out from underneath us */
  		goto out_unlock;
  	}
@@@ -5100,7 -6070,6 +6061,6 @@@
  		unlock_extent_cached(io_tree, page_start, page_end,
  				     &cached_state, GFP_NOFS);
  		ret = VM_FAULT_SIGBUS;
- 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
  		goto out_unlock;
  	}
  	ret = 0;
@@@ -5127,10 -6096,10 +6087,10 @@@
  	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
  
  out_unlock:
- 	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
  	if (!ret)
  		return VM_FAULT_LOCKED;
  	unlock_page(page);
+ 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
  out:
  	return ret;
  }
@@@ -5155,8 -6124,10 +6115,10 @@@ static void btrfs_truncate(struct inod
  	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
  	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
- 	trans = btrfs_start_transaction(root, 1);
+ 	trans = btrfs_start_transaction(root, 0);
+ 	BUG_ON(IS_ERR(trans));
  	btrfs_set_trans_block_group(trans, inode);
+ 	trans->block_rsv = root->orphan_block_rsv;
  
  	/*
  	 * setattr is responsible for setting the ordered_data_close flag,
@@@ -5179,6 -6150,23 +6141,23 @@@
  		btrfs_add_ordered_operation(trans, root, inode);
  
  	while (1) {
+ 		if (!trans) {
+ 			trans = btrfs_start_transaction(root, 0);
+ 			BUG_ON(IS_ERR(trans));
+ 			btrfs_set_trans_block_group(trans, inode);
+ 			trans->block_rsv = root->orphan_block_rsv;
+ 		}
+ 
+ 		ret = btrfs_block_rsv_check(trans, root,
+ 					    root->orphan_block_rsv, 0, 5);
+ 		if (ret) {
+ 			BUG_ON(ret != -EAGAIN);
+ 			ret = btrfs_commit_transaction(trans, root);
+ 			BUG_ON(ret);
+ 			trans = NULL;
+ 			continue;
+ 		}
+ 
  		ret = btrfs_truncate_inode_items(trans, root, inode,
  						 inode->i_size,
  						 BTRFS_EXTENT_DATA_KEY);
@@@ -5190,10 -6178,8 +6169,8 @@@
  
  		nr = trans->blocks_used;
  		btrfs_end_transaction(trans, root);
+ 		trans = NULL;
  		btrfs_btree_balance_dirty(root, nr);
- 
- 		trans = btrfs_start_transaction(root, 1);
- 		btrfs_set_trans_block_group(trans, inode);
  	}
  
  	if (ret == 0 && inode->i_nlink > 0) {
@@@ -5254,21 -6240,47 +6231,47 @@@ unsigned long btrfs_force_ra(struct add
  struct inode *btrfs_alloc_inode(struct super_block *sb)
  {
  	struct btrfs_inode *ei;
+ 	struct inode *inode;
  
  	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
  	if (!ei)
  		return NULL;
+ 
+ 	ei->root = NULL;
+ 	ei->space_info = NULL;
+ 	ei->generation = 0;
+ 	ei->sequence = 0;
  	ei->last_trans = 0;
  	ei->last_sub_trans = 0;
  	ei->logged_trans = 0;
- 	ei->outstanding_extents = 0;
- 	ei->reserved_extents = 0;
- 	ei->root = NULL;
+ 	ei->delalloc_bytes = 0;
+ 	ei->reserved_bytes = 0;
+ 	ei->disk_i_size = 0;
+ 	ei->flags = 0;
+ 	ei->index_cnt = (u64)-1;
+ 	ei->last_unlink_trans = 0;
+ 
  	spin_lock_init(&ei->accounting_lock);
+ 	atomic_set(&ei->outstanding_extents, 0);
+ 	ei->reserved_extents = 0;
+ 
+ 	ei->ordered_data_close = 0;
+ 	ei->orphan_meta_reserved = 0;
+ 	ei->dummy_inode = 0;
+ 	ei->force_compress = 0;
+ 
+ 	inode = &ei->vfs_inode;
+ 	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+ 	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+ 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ 	mutex_init(&ei->log_mutex);
  	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
  	INIT_LIST_HEAD(&ei->i_orphan);
+ 	INIT_LIST_HEAD(&ei->delalloc_inodes);
  	INIT_LIST_HEAD(&ei->ordered_operations);
- 	return &ei->vfs_inode;
+ 	RB_CLEAR_NODE(&ei->rb_node);
+ 
+ 	return inode;
  }
  
  void btrfs_destroy_inode(struct inode *inode)
@@@ -5278,6 -6290,8 +6281,8 @@@
  
  	WARN_ON(!list_empty(&inode->i_dentry));
  	WARN_ON(inode->i_data.nrpages);
+ 	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+ 	WARN_ON(BTRFS_I(inode)->reserved_extents);
  
  	/*
  	 * This can happen where we create an inode, but somebody else also
@@@ -5298,13 -6312,13 +6303,13 @@@
  		spin_unlock(&root->fs_info->ordered_extent_lock);
  	}
  
- 	spin_lock(&root->list_lock);
+ 	spin_lock(&root->orphan_lock);
  	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
  		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
  		       inode->i_ino);
  		list_del_init(&BTRFS_I(inode)->i_orphan);
  	}
- 	spin_unlock(&root->list_lock);
+ 	spin_unlock(&root->orphan_lock);
  
  	while (1) {
  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@@ -5425,19 -6439,6 +6430,6 @@@ static int btrfs_rename(struct inode *o
  	if (S_ISDIR(old_inode->i_mode) && new_inode &&
  	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
  		return -ENOTEMPTY;
- 
- 	/*
- 	 * We want to reserve the absolute worst case amount of items.  So if
- 	 * both inodes are subvols and we need to unlink them then that would
- 	 * require 4 item modifications, but if they are both normal inodes it
- 	 * would require 5 item modifications, so we'll assume their normal
- 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- 	 * should cover the worst case number of items we'll modify.
- 	 */
- 	ret = btrfs_reserve_metadata_space(root, 11);
- 	if (ret)
- 		return ret;
- 
  	/*
  	 * we're using rename to replace one file with another.
  	 * and the replacement file is large.  Start IO on it now so
@@@ -5450,8 -6451,18 +6442,18 @@@
  	/* close the racy window with snapshot create/destroy ioctl */
  	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
  		down_read(&root->fs_info->subvol_sem);
+ 	/*
+ 	 * We want to reserve the absolute worst case amount of items.  So if
+ 	 * both inodes are subvols and we need to unlink them then that would
+ 	 * require 4 item modifications, but if they are both normal inodes it
+ 	 * would require 5 item modifications, so we'll assume they're normal
+ 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ 	 * should cover the worst case number of items we'll modify.
+ 	 */
+ 	trans = btrfs_start_transaction(root, 20);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
  	btrfs_set_trans_block_group(trans, new_dir);
  
  	if (dest != root)
@@@ -5550,7 -6561,6 +6552,6 @@@ out_fail
  	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
  		up_read(&root->fs_info->subvol_sem);
  
- 	btrfs_unreserve_metadata_space(root, 11);
  	return ret;
  }
  
@@@ -5602,6 -6612,38 +6603,38 @@@ int btrfs_start_delalloc_inodes(struct 
  	return 0;
  }
  
+ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+ {
+ 	struct btrfs_inode *binode;
+ 	struct inode *inode = NULL;
+ 
+ 	spin_lock(&root->fs_info->delalloc_lock);
+ 	while (!list_empty(&root->fs_info->delalloc_inodes)) {
+ 		binode = list_entry(root->fs_info->delalloc_inodes.next,
+ 				    struct btrfs_inode, delalloc_inodes);
+ 		inode = igrab(&binode->vfs_inode);
+ 		if (inode) {
+ 			list_move_tail(&binode->delalloc_inodes,
+ 				       &root->fs_info->delalloc_inodes);
+ 			break;
+ 		}
+ 
+ 		list_del_init(&binode->delalloc_inodes);
+ 		cond_resched_lock(&root->fs_info->delalloc_lock);
+ 	}
+ 	spin_unlock(&root->fs_info->delalloc_lock);
+ 
+ 	if (inode) {
+ 		write_inode_now(inode, 0);
+ 		if (delay_iput)
+ 			btrfs_add_delayed_iput(inode);
+ 		else
+ 			iput(inode);
+ 		return 1;
+ 	}
+ 	return 0;
+ }
+ 
  static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
  			 const char *symname)
  {
@@@ -5625,26 -6667,20 +6658,20 @@@
  	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
  		return -ENAMETOOLONG;
  
+ 	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ 	if (err)
+ 		return err;
  	/*
  	 * 2 items for inode item and ref
  	 * 2 items for dir items
  	 * 1 item for xattr if selinux is on
  	 */
- 	err = btrfs_reserve_metadata_space(root, 5);
- 	if (err)
- 		return err;
+ 	trans = btrfs_start_transaction(root, 5);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans)
- 		goto out_fail;
  	btrfs_set_trans_block_group(trans, dir);
  
- 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- 	if (err) {
- 		err = -ENOSPC;
- 		goto out_unlock;
- 	}
- 
  	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
  				dentry->d_name.len,
  				dentry->d_parent->d_inode->i_ino, objectid,
@@@ -5716,8 -6752,6 +6743,6 @@@
  out_unlock:
  	nr = trans->blocks_used;
  	btrfs_end_transaction_throttle(trans, root);
- out_fail:
- 	btrfs_unreserve_metadata_space(root, 5);
  	if (drop_inode) {
  		inode_dec_link_count(inode);
  		iput(inode);
@@@ -5726,33 -6760,28 +6751,28 @@@
  	return err;
  }
  
- static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
- 			u64 alloc_hint, int mode, loff_t actual_len)
+ int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ 			      u64 start, u64 num_bytes, u64 min_size,
+ 			      loff_t actual_len, u64 *alloc_hint)
  {
  	struct btrfs_trans_handle *trans;
  	struct btrfs_root *root = BTRFS_I(inode)->root;
  	struct btrfs_key ins;
  	u64 cur_offset = start;
- 	u64 num_bytes = end - start;
  	int ret = 0;
- 	u64 i_size;
  
  	while (num_bytes > 0) {
- 		trans = btrfs_start_transaction(root, 1);
- 
- 		ret = btrfs_reserve_extent(trans, root, num_bytes,
- 					   root->sectorsize, 0, alloc_hint,
- 					   (u64)-1, &ins, 1);
- 		if (ret) {
- 			WARN_ON(1);
- 			goto stop_trans;
+ 		trans = btrfs_start_transaction(root, 3);
+ 		if (IS_ERR(trans)) {
+ 			ret = PTR_ERR(trans);
+ 			break;
  		}
  
- 		ret = btrfs_reserve_metadata_space(root, 3);
+ 		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+ 					   0, *alloc_hint, (u64)-1, &ins, 1);
  		if (ret) {
- 			btrfs_free_reserved_extent(root, ins.objectid,
- 						   ins.offset);
- 			goto stop_trans;
+ 			btrfs_end_transaction(trans, root);
+ 			break;
  		}
  
  		ret = insert_reserved_file_extent(trans, inode,
@@@ -5766,34 -6795,27 +6786,27 @@@
  
  		num_bytes -= ins.offset;
  		cur_offset += ins.offset;
- 		alloc_hint = ins.objectid + ins.offset;
+ 		*alloc_hint = ins.objectid + ins.offset;
  
  		inode->i_ctime = CURRENT_TIME;
  		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- 			(actual_len > inode->i_size) &&
- 			(cur_offset > inode->i_size)) {
- 
+ 		    (actual_len > inode->i_size) &&
+ 		    (cur_offset > inode->i_size)) {
  			if (cur_offset > actual_len)
- 				i_size  = actual_len;
+ 				i_size_write(inode, actual_len);
  			else
- 				i_size = cur_offset;
- 			i_size_write(inode, i_size);
- 			btrfs_ordered_update_i_size(inode, i_size, NULL);
+ 				i_size_write(inode, cur_offset);
+ 			/* preserve the actual_len clamp applied just above */
+ 			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  		}
  
  		ret = btrfs_update_inode(trans, root, inode);
  		BUG_ON(ret);
  
  		btrfs_end_transaction(trans, root);
- 		btrfs_unreserve_metadata_space(root, 3);
  	}
  	return ret;
- 
- stop_trans:
- 	btrfs_end_transaction(trans, root);
- 	return ret;
- 
  }
  
  static long btrfs_fallocate(struct inode *inode, int mode,
@@@ -5826,8 -6848,7 +6839,7 @@@
  			goto out;
  	}
  
- 	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
- 					  alloc_end - alloc_start);
+ 	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
  	if (ret)
  		goto out;
  
@@@ -5872,16 -6893,16 +6884,16 @@@
  		if (em->block_start == EXTENT_MAP_HOLE ||
  		    (cur_offset >= inode->i_size &&
  		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- 			ret = prealloc_file_range(inode,
- 						  cur_offset, last_byte,
- 						alloc_hint, mode, offset+len);
+ 			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ 							last_byte - cur_offset,
+ 							1 << inode->i_blkbits,
+ 							offset + len,
+ 							&alloc_hint);
  			if (ret < 0) {
  				free_extent_map(em);
  				break;
  			}
  		}
- 		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
- 			alloc_hint = em->block_start;
  		free_extent_map(em);
  
  		cur_offset = last_byte;
@@@ -5893,8 -6914,7 +6905,7 @@@
  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
  			     &cached_state, GFP_NOFS);
  
- 	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
- 				       alloc_end - alloc_start);
+ 	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
  out:
  	mutex_unlock(&inode->i_mutex);
  	return ret;
diff --combined fs/btrfs/super.c
index 2909a03e5230,574285c8cbd4..d34b2dfc9628
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@@ -498,7 -498,7 +498,7 @@@ int btrfs_sync_fs(struct super_block *s
  	btrfs_start_delalloc_inodes(root, 0);
  	btrfs_wait_ordered_extents(root, 0, 0);
  
- 	trans = btrfs_start_transaction(root, 1);
+ 	trans = btrfs_start_transaction(root, 0);
  	ret = btrfs_commit_transaction(trans, root);
  	return ret;
  }
@@@ -694,11 -694,11 +694,11 @@@ static int btrfs_remount(struct super_b
  		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
  			return -EINVAL;
  
- 		/* recover relocation */
- 		ret = btrfs_recover_relocation(root);
+ 		ret = btrfs_cleanup_fs_roots(root->fs_info);
  		WARN_ON(ret);
  
- 		ret = btrfs_cleanup_fs_roots(root->fs_info);
+ 		/* recover relocation */
+ 		ret = btrfs_recover_relocation(root);
  		WARN_ON(ret);
  
  		sb->s_flags &= ~MS_RDONLY;
@@@ -714,34 -714,18 +714,18 @@@ static int btrfs_statfs(struct dentry *
  	struct list_head *head = &root->fs_info->space_info;
  	struct btrfs_space_info *found;
  	u64 total_used = 0;
- 	u64 data_used = 0;
  	int bits = dentry->d_sb->s_blocksize_bits;
  	__be32 *fsid = (__be32 *)root->fs_info->fsid;
  
  	rcu_read_lock();
- 	list_for_each_entry_rcu(found, head, list) {
- 		if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
- 				    BTRFS_BLOCK_GROUP_RAID10|
- 				    BTRFS_BLOCK_GROUP_RAID1)) {
- 			total_used += found->bytes_used;
- 			if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- 				data_used += found->bytes_used;
- 			else
- 				data_used += found->total_bytes;
- 		}
- 
- 		total_used += found->bytes_used;
- 		if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- 			data_used += found->bytes_used;
- 		else
- 			data_used += found->total_bytes;
- 	}
+ 	list_for_each_entry_rcu(found, head, list)
+ 		total_used += found->disk_used;
  	rcu_read_unlock();
  
  	buf->f_namelen = BTRFS_NAME_LEN;
  	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
  	buf->f_bfree = buf->f_blocks - (total_used >> bits);
- 	buf->f_bavail = buf->f_blocks - (data_used >> bits);
+ 	buf->f_bavail = buf->f_bfree;
  	buf->f_bsize = dentry->d_sb->s_blocksize;
  	buf->f_type = BTRFS_SUPER_MAGIC;
  
@@@ -832,14 -816,11 +816,14 @@@ static const struct file_operations btr
  };
  
  static struct miscdevice btrfs_misc = {
 -	.minor		= MISC_DYNAMIC_MINOR,
 +	.minor		= BTRFS_MINOR,
  	.name		= "btrfs-control",
  	.fops		= &btrfs_ctl_fops
  };
  
 +MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
 +MODULE_ALIAS("devname:btrfs-control");
 +
  static int btrfs_interface_init(void)
  {
  	return misc_register(&btrfs_misc);
diff --combined fs/btrfs/xattr.c
index 59acd3eb288a,007fae581a04..88ecbb215878
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@@ -154,15 -154,10 +154,10 @@@ int __btrfs_setxattr(struct btrfs_trans
  	if (trans)
  		return do_setxattr(trans, inode, name, value, size, flags);
  
- 	ret = btrfs_reserve_metadata_space(root, 2);
- 	if (ret)
- 		return ret;
+ 	trans = btrfs_start_transaction(root, 2);
+ 	if (IS_ERR(trans))
+ 		return PTR_ERR(trans);
  
- 	trans = btrfs_start_transaction(root, 1);
- 	if (!trans) {
- 		ret = -ENOMEM;
- 		goto out;
- 	}
  	btrfs_set_trans_block_group(trans, inode);
  
  	ret = do_setxattr(trans, inode, name, value, size, flags);
@@@ -174,7 -169,6 +169,6 @@@
  	BUG_ON(ret);
  out:
  	btrfs_end_transaction_throttle(trans, root);
- 	btrfs_unreserve_metadata_space(root, 2);
  	return ret;
  }
  
@@@ -282,7 -276,7 +276,7 @@@ err
   * List of handlers for synthetic system.* attributes.  All real ondisk
   * attributes are handled directly.
   */
 -struct xattr_handler *btrfs_xattr_handlers[] = {
 +const struct xattr_handler *btrfs_xattr_handlers[] = {
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
  	&btrfs_xattr_acl_access_handler,
  	&btrfs_xattr_acl_default_handler,
diff --combined include/linux/fs.h
index 9682d52d1507,10704f0086c8..85e823adcd4a
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -651,7 -651,6 +651,7 @@@ struct block_device 
  	int			bd_openers;
  	struct mutex		bd_mutex;	/* open/close mutex */
  	struct list_head	bd_inodes;
 +	void *			bd_claiming;
  	void *			bd_holder;
  	int			bd_holders;
  #ifdef CONFIG_SYSFS
@@@ -1281,12 -1280,10 +1281,12 @@@ static inline int lock_may_write(struc
  
  
  struct fasync_struct {
 -	int	magic;
 -	int	fa_fd;
 -	struct	fasync_struct	*fa_next; /* singly linked list */
 -	struct	file 		*fa_file;
 +	spinlock_t		fa_lock;
 +	int			magic;
 +	int			fa_fd;
 +	struct fasync_struct	*fa_next; /* singly linked list */
 +	struct file		*fa_file;
 +	struct rcu_head		fa_rcu;
  };
  
  #define FASYNC_MAGIC 0x4601
@@@ -1295,6 -1292,8 +1295,6 @@@
  extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
  /* can be called from interrupts */
  extern void kill_fasync(struct fasync_struct **, int, int);
 -/* only for net: no internal synchronization */
 -extern void __kill_fasync(struct fasync_struct *, int, int);
  
  extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
  extern int f_setown(struct file *filp, unsigned long arg, int force);
@@@ -1315,6 -1314,8 +1315,6 @@@ extern int send_sigurg(struct fown_stru
  extern struct list_head super_blocks;
  extern spinlock_t sb_lock;
  
 -#define sb_entry(list)  list_entry((list), struct super_block, s_list)
 -#define S_BIAS (1<<30)
  struct super_block {
  	struct list_head	s_list;		/* Keep this first */
  	dev_t			s_dev;		/* search index; _not_ kdev_t */
@@@ -1333,11 -1334,12 +1333,11 @@@
  	struct rw_semaphore	s_umount;
  	struct mutex		s_lock;
  	int			s_count;
 -	int			s_need_sync;
  	atomic_t		s_active;
  #ifdef CONFIG_SECURITY
  	void                    *s_security;
  #endif
 -	struct xattr_handler	**s_xattr;
 +	const struct xattr_handler **s_xattr;
  
  	struct list_head	s_inodes;	/* all inodes */
  	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
@@@ -1429,8 -1431,7 +1429,8 @@@ extern void dentry_unhash(struct dentr
   * VFS file helper functions.
   */
  extern int file_permission(struct file *, int);
 -
 +extern void inode_init_owner(struct inode *inode, const struct inode *dir,
 +			mode_t mode);
  /*
   * VFS FS_IOC_FIEMAP helper definitions.
   */
@@@ -1743,7 -1744,6 +1743,7 @@@ struct file_system_type 
  
  	struct lock_class_key s_lock_key;
  	struct lock_class_key s_umount_key;
 +	struct lock_class_key s_vfs_rename_key;
  
  	struct lock_class_key i_lock_key;
  	struct lock_class_key i_mutex_key;
@@@ -1781,6 -1781,8 +1781,6 @@@ extern int get_sb_pseudo(struct file_sy
  	const struct super_operations *ops, unsigned long,
  	struct vfsmount *mnt);
  extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 -int __put_super_and_need_restart(struct super_block *sb);
 -void put_super(struct super_block *sb);
  
  /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
  #define fops_get(fops) \
@@@ -1800,8 -1802,6 +1800,8 @@@ extern void drop_collected_mounts(struc
  extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
  			  struct vfsmount *);
  extern int vfs_statfs(struct dentry *, struct kstatfs *);
 +extern int freeze_super(struct super_block *super);
 +extern int thaw_super(struct super_block *super);
  
  extern int current_umask(void);
  
@@@ -2087,9 -2087,9 +2087,9 @@@ extern int __filemap_fdatawrite_range(s
  extern int filemap_fdatawrite_range(struct address_space *mapping,
  				loff_t start, loff_t end);
  
 -extern int vfs_fsync_range(struct file *file, struct dentry *dentry,
 -			   loff_t start, loff_t end, int datasync);
 -extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 +extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 +			   int datasync);
 +extern int vfs_fsync(struct file *file, int datasync);
  extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
  extern void sync_supers(void);
  extern void emergency_sync(void);
@@@ -2228,7 -2228,6 +2228,7 @@@ extern long do_splice_direct(struct fil
  
  extern void
  file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 +extern loff_t noop_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
  extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset,
@@@ -2251,10 -2250,15 +2251,15 @@@ static inline int xip_truncate_page(str
  #endif
  
  #ifdef CONFIG_BLOCK
+ struct bio;
+ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
+ 			    loff_t file_offset);
+ void dio_end_io(struct bio *bio, int error);
+ 
  ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
  	struct block_device *bdev, const struct iovec *iov, loff_t offset,
  	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- 	int lock_type);
+ 	dio_submit_t submit_io,	int lock_type);
  
  enum {
  	/* need locking between buffered and direct access */
@@@ -2270,7 -2274,7 +2275,7 @@@ static inline ssize_t blockdev_direct_I
  	dio_iodone_t end_io)
  {
  	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- 				    nr_segs, get_block, end_io,
+ 				    nr_segs, get_block, end_io, NULL,
  				    DIO_LOCKING | DIO_SKIP_HOLES);
  }
  
@@@ -2280,7 -2284,7 +2285,7 @@@ static inline ssize_t blockdev_direct_I
  	dio_iodone_t end_io)
  {
  	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- 				nr_segs, get_block, end_io, 0);
+ 				    nr_segs, get_block, end_io, NULL, 0);
  }
  #endif
  
@@@ -2330,7 -2334,6 +2335,7 @@@ extern struct super_block *get_super(st
  extern struct super_block *get_active_super(struct block_device *bdev);
  extern struct super_block *user_get_super(dev_t);
  extern void drop_super(struct super_block *sb);
 +extern void iterate_supers(void (*)(struct super_block *, void *), void *);
  
  extern int dcache_dir_open(struct inode *, struct file *);
  extern int dcache_dir_close(struct inode *, struct file *);
@@@ -2364,8 -2367,6 +2369,8 @@@ extern void simple_release_fs(struct vf
  
  extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
  			loff_t *ppos, const void *from, size_t available);
 +extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 +		const void __user *from, size_t count);
  
  extern int simple_fsync(struct file *, struct dentry *, int);
  
diff --combined mm/filemap.c
index 35e12d186566,829ac9cdbd70..45a2d18df849
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -441,7 -441,7 +441,7 @@@ int add_to_page_cache_lru(struct page *
  	/*
  	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
  	 * before shmem_readpage has a chance to mark them as SwapBacked: they
 -	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
 +	 * need to go on the anon lru below, and mem_cgroup_cache_charge
  	 * (called in add_to_page_cache) needs to know where they're going too.
  	 */
  	if (mapping_cap_swap_backed(mapping))
@@@ -452,7 -452,7 +452,7 @@@
  		if (page_is_file_cache(page))
  			lru_cache_add_file(page);
  		else
 -			lru_cache_add_active_anon(page);
 +			lru_cache_add_anon(page);
  	}
  	return ret;
  }
@@@ -461,15 -461,9 +461,15 @@@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru
  #ifdef CONFIG_NUMA
  struct page *__page_cache_alloc(gfp_t gfp)
  {
 +	int n;
 +	struct page *page;
 +
  	if (cpuset_do_page_mem_spread()) {
 -		int n = cpuset_mem_spread_node();
 -		return alloc_pages_exact_node(n, gfp, 0);
 +		get_mems_allowed();
 +		n = cpuset_mem_spread_node();
 +		page = alloc_pages_exact_node(n, gfp, 0);
 +		put_mems_allowed();
 +		return page;
  	}
  	return alloc_pages(gfp, 0);
  }
@@@ -1105,12 -1099,6 +1105,12 @@@ page_not_up_to_date_locked
  		}
  
  readpage:
 +		/*
 +		 * A previous I/O error may have been due to temporary
 +		 * failures, eg. multipath errors.
 +		 * PG_error will be set again if readpage fails.
 +		 */
 +		ClearPageError(page);
  		/* Start the actual read. The read will unlock the page. */
  		error = mapping->a_ops->readpage(filp, page);
  
@@@ -1275,7 -1263,7 +1275,7 @@@ generic_file_aio_read(struct kiocb *ioc
  {
  	struct file *filp = iocb->ki_filp;
  	ssize_t retval;
- 	unsigned long seg;
+ 	unsigned long seg = 0;
  	size_t count;
  	loff_t *ppos = &iocb->ki_pos;
  
@@@ -1302,21 -1290,47 +1302,47 @@@
  				retval = mapping->a_ops->direct_IO(READ, iocb,
  							iov, pos, nr_segs);
  			}
- 			if (retval > 0)
+ 			if (retval > 0) {
  				*ppos = pos + retval;
- 			if (retval) {
+ 				count -= retval;
+ 			}
+ 
+ 			/*
+ 			 * Btrfs can have a short DIO read if we encounter
+ 			 * compressed extents, so if there was an error, or if
+ 			 * we've already read everything we wanted to, or if
+ 			 * there was a short read because we hit EOF, go ahead
+ 			 * and return.  Otherwise fallthrough to buffered io for
+ 			 * the rest of the read.
+ 			 */
+ 			if (retval < 0 || !count || *ppos >= size) {
  				file_accessed(filp);
  				goto out;
  			}
  		}
  	}
  
+ 	count = retval;
  	for (seg = 0; seg < nr_segs; seg++) {
  		read_descriptor_t desc;
+ 		loff_t offset = 0;
+ 
+ 		/*
+ 		 * If we did a short DIO read we need to skip the section of the
+ 		 * iov that we've already read data into.
+ 		 */
+ 		if (count) {
+ 			if (count > iov[seg].iov_len) {
+ 				count -= iov[seg].iov_len;
+ 				continue;
+ 			}
+ 			offset = count;
+ 			count = 0;
+ 		}
  
  		desc.written = 0;
- 		desc.arg.buf = iov[seg].iov_base;
- 		desc.count = iov[seg].iov_len;
+ 		desc.arg.buf = iov[seg].iov_base + offset;
+ 		desc.count = iov[seg].iov_len - offset;
  		if (desc.count == 0)
  			continue;
  		desc.error = 0;