From: Linus Torvalds Date: Thu, 27 May 2010 17:43:44 +0000 (-0700) Subject: Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable X-Git-Tag: v2.6.35-rc1~36 X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/105a048a4f35f7a74c7cc20b36dd83658b6ec232?hp=-c Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable * git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (27 commits) Btrfs: add more error checking to btrfs_dirty_inode Btrfs: allow unaligned DIO Btrfs: drop verbose enospc printk Btrfs: Fix block generation verification race Btrfs: fix preallocation and nodatacow checks in O_DIRECT Btrfs: avoid ENOSPC errors in btrfs_dirty_inode Btrfs: move O_DIRECT space reservation to btrfs_direct_IO Btrfs: rework O_DIRECT enospc handling Btrfs: use async helpers for DIO write checksumming Btrfs: don't walk around with task->state != TASK_RUNNING Btrfs: do aio_write instead of write Btrfs: add basic DIO read/write support direct-io: do not merge logically non-contiguous requests direct-io: add a hook for the fs to provide its own submit_bio function fs: allow short direct-io reads to be completed via buffered IO Btrfs: Metadata ENOSPC handling for balance Btrfs: Pre-allocate space for data relocation Btrfs: Metadata ENOSPC handling for tree log Btrfs: Metadata reservation for orphan inodes Btrfs: Introduce global metadata reservation ... --- 105a048a4f35f7a74c7cc20b36dd83658b6ec232 diff --combined fs/btrfs/extent-tree.c index c6a4f459ad76,6c14101506e1..b9080d71991a --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@@ -35,10 -35,9 +35,9 @@@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free); - static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 bytenr, u64 num_bytes, int alloc); + static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@@ -61,12 -60,6 +60,6 @@@ static int alloc_reserved_tree_block(st static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); - static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@@ -91,8 -84,12 +84,12 @@@ void btrfs_get_block_group(struct btrfs void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { - if (atomic_dec_and_test(&cache->count)) + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); kfree(cache); + } } /* @@@ -319,7 -316,7 +316,7 @@@ static int caching_kthread(void *data exclude_super_stripes(extent_root, block_group); spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_super += block_group->bytes_super; + block_group->space_info->bytes_readonly += block_group->bytes_super; spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@@ -507,6 -504,9 +504,9 @@@ static struct btrfs_space_info *__find_ struct list_head *head = &info->space_info; struct btrfs_space_info *found; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags == flags) { @@@ -609,6 -609,113 +609,113 @@@ int btrfs_lookup_extent(struct btrfs_ro return ret; } + /* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ + int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) + { + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { + #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + #else + BUG(); + #endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); + out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; + out_free: + btrfs_free_path(path); + return ret; + } + /* * Back reference rules. Back refs have three main goals: * @@@ -1589,7 -1696,7 +1696,7 @@@ static void btrfs_issue_discard(struct u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@@ -1871,7 -1978,6 +1978,6 @@@ static int run_delayed_tree_ref(struct return ret; } - /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@@ -1891,32 -1997,14 +1997,14 @@@ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - - ret = pin_down_bytes(trans, root, NULL, - node->bytenr, node->num_bytes, - head->is_data, 1, &must_clean); - if (ret > 0) - mark_free = 1; - - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - if (mark_free) { - ret = btrfs_free_reserved_extent(root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } } mutex_unlock(&head->mutex); return 0; @@@ -2347,6 -2435,8 +2435,8 @@@ int btrfs_cross_ref_exist(struct btrfs_ ret = 0; out: btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); return ret; } @@@ -2660,12 -2750,21 +2750,21 @@@ static int update_space_info(struct btr struct btrfs_space_info **space_info) { struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; found = __find_space_info(info, flags); if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; found->full = 0; spin_unlock(&found->lock); *space_info = found; @@@ -2675,18 -2774,20 +2774,20 @@@ if (!found) return -ENOMEM; - INIT_LIST_HEAD(&found->block_groups); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); - init_waitqueue_head(&found->flush_wait); - init_waitqueue_head(&found->allocate_wait); spin_lock_init(&found->lock); - found->flags = flags; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; - found->bytes_delalloc = 0; + found->bytes_may_use = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@@ -2711,19 -2812,6 +2812,6 @@@ static void set_avail_alloc_bits(struc } } - static void set_block_group_readonly(struct btrfs_block_group_cache *cache) - { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (!cache->ro) { - cache->space_info->bytes_readonly += cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->ro = 1; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; @@@ -2752,722 -2840,946 +2840,946 @@@ return flags; } - static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) - { - struct btrfs_fs_info *info = root->fs_info; - u64 alloc_profile; - - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } - - return btrfs_reduce_alloc_profile(root, data); - } - - void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 alloc_target; - - alloc_target = btrfs_get_alloc_profile(root, 1); - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - alloc_target); + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); } - static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) + static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { - u64 num_bytes; - int level; - - level = BTRFS_MAX_LEVEL - 2; - /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. - */ + u64 flags; - /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. - */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; else - num_bytes = (num_items + (2 * num_items)) * 3; + flags = BTRFS_BLOCK_GROUP_METADATA; - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; + return get_alloc_profile(root, flags); + } - return num_bytes; + void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) + { + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); } /* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. */ - int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) + int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; + struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 used; + int ret = 0, committed = 0; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); + data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); + alloc: + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); - BUG_ON(bug); + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret < 0) + return ret; - return 0; - } + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } + goto again; + } + spin_unlock(&data_sinfo->lock); - static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) - { - u64 thresh; + /* commit the current transaction and try again */ + if (!committed && !root->fs_info->open_ioctl_trans) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; + #if 0 /* I hope we never need this code again, just in case */ + printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " + "%llu bytes_reserved, " "%llu bytes_pinned, " + "%llu bytes_readonly, %llu may use %llu total\n", + (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); + #endif + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; + return 0; } - struct async_flush { - struct btrfs_root *root; - struct btrfs_space_info *info; - struct btrfs_work work; - }; - - static noinline void flush_delalloc_async(struct btrfs_work *work) + /* + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space + */ + void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { - struct async_flush *async; - struct btrfs_root *root; - struct btrfs_space_info *info; - - async = container_of(work, struct async_flush, work); - root = async->root; - info = async->info; - - btrfs_start_delalloc_inodes(root, 0); - wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0, 0); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_space_info *data_sinfo; - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - kfree(async); + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); } - static void wait_on_flush(struct btrfs_space_info *info) + static void force_metadata_allocation(struct btrfs_fs_info *info) { - DEFINE_WAIT(wait); - u64 used; - - while (1) { - prepare_to_wait(&info->flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&info->lock); - if (!info->flushing) { - spin_unlock(&info->lock); - break; - } + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { - spin_unlock(&info->lock); - break; - } - spin_unlock(&info->lock); - schedule(); + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; } - finish_wait(&info->flush_wait, &wait); + rcu_read_unlock(); } - static void flush_delalloc(struct btrfs_root *root, - struct btrfs_space_info *info) + static int should_alloc_chunk(struct btrfs_space_info *sinfo, + u64 alloc_bytes) { - struct async_flush *async; - bool wait = false; - - spin_lock(&info->lock); - - if (!info->flushing) - info->flushing = 1; - else - wait = true; - - spin_unlock(&info->lock); - - if (wait) { - wait_on_flush(info); - return; - } - - async = kzalloc(sizeof(*async), GFP_NOFS); - if (!async) - goto flush; - - async->root = root; - async->info = info; - async->work.func = flush_delalloc_async; + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; - btrfs_queue_worker(&root->fs_info->enospc_workers, - &async->work); - wait_on_flush(info); - return; + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; - flush: - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes < div_factor(num_bytes, 8)) + return 0; - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); + return 1; } - static int maybe_allocate_chunk(struct btrfs_root *root, - struct btrfs_space_info *info) + static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) { - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct btrfs_trans_handle *trans; - bool wait = false; + struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret = 0; - u64 min_metadata; - u64 free_space; - free_space = btrfs_super_total_bytes(disk_super); - /* - * we allow the metadata to grow to a max of either 10gb or 5% of the - * space in the volume. - */ - min_metadata = min((u64)10 * 1024 * 1024 * 1024, - div64_u64(free_space * 5, 100)); - if (info->total_bytes >= min_metadata) { - spin_unlock(&info->lock); - return 0; - } + mutex_lock(&fs_info->chunk_mutex); - if (info->full) { - spin_unlock(&info->lock); - return 0; + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); } + BUG_ON(!space_info); - if (!info->allocating_chunk) { - info->force_alloc = 1; - info->allocating_chunk = 1; - } else { - wait = true; + spin_lock(&space_info->lock); + if (space_info->force_alloc) + force = 1; + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; } - spin_unlock(&info->lock); - - if (wait) { - wait_event(info->allocate_wait, - !info->allocating_chunk); - return 1; + if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { + spin_unlock(&space_info->lock); + goto out; } + spin_unlock(&space_info->lock); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 4096 + 2 * 1024 * 1024, - info->flags, 0); - btrfs_end_transaction(trans, root); + ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) - goto out; + space_info->full = 1; + else + ret = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: - spin_lock(&info->lock); - info->allocating_chunk = 0; - spin_unlock(&info->lock); - wake_up(&info->allocate_wait); + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; + } - if (ret) + static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 num_bytes) + { + int ret; + int end_trans = 0; + + if (sinfo->full) return 0; - return 1; + + spin_lock(&sinfo->lock); + ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); + spin_unlock(&sinfo->lock); + if (!ret) + return 0; + + if (!trans) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + end_trans = 1; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, + get_alloc_profile(root, sinfo->flags), 0); + + if (end_trans) + btrfs_end_transaction(trans, root); + + return ret == 1 ? 1 : 0; } /* - * Reserve metadata space for delalloc. + * shrink metadata reservation for delalloc */ - int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) + static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim) + { + struct btrfs_block_rsv *block_rsv; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + int pause = 1; + int ret; + + block_rsv = &root->fs_info->delalloc_block_rsv; + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (1) { + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + if (!ret) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } else { + pause = 1; + } + + spin_lock(&block_rsv->lock); + if (reserved > block_rsv->reserved) + reclaimed = reserved - block_rsv->reserved; + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + } + return reclaimed >= to_reclaim; + } + + static int should_retry_reserve(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; + struct btrfs_space_info *space_info = block_rsv->space_info; + int ret; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + if ((*retries) > 2) + return -ENOSPC; - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - again: - spin_lock(&meta_sinfo->lock); + ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); + if (ret) + return 1; - force_delalloc = meta_sinfo->force_delalloc; + if (trans && trans->transaction->in_commit) + return -ENOSPC; - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + ret = shrink_delalloc(trans, root, num_bytes); + if (ret) + return ret; - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; + spin_lock(&space_info->lock); + if (space_info->bytes_pinned < num_bytes) + ret = 1; + spin_unlock(&space_info->lock); + if (ret) + return -ENOSPC; - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + (*retries)++; - if (used > meta_sinfo->total_bytes) { - flushed++; + if (trans) + return -EAGAIN; - if (flushed == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - flushed++; + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + + return 1; + } + + static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) + { + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + int ret = -ENOSPC; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + if (unused < space_info->total_bytes) + unused = space_info->total_bytes - unused; + else + unused = 0; + + if (unused >= num_bytes) { + if (block_rsv->priority >= 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; } else { - spin_unlock(&meta_sinfo->lock); + if ((unused + block_rsv->reserved) * + block_rsv->priority >= + (num_bytes + block_rsv->reserved) * 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } } + } + spin_unlock(&space_info->lock); - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - flush_delalloc(root, meta_sinfo); - goto again; + return ret; + } + + static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; + } + + static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) + { + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; + } + + static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) + { + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); + } + + void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) + { + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + block_rsv_add_bytes(dest, num_bytes, 0); + } else { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + spin_unlock(&space_info->lock); } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; } + } - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); + static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) + { + int ret; - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + block_rsv_add_bytes(dst, num_bytes, 1); return 0; } - /* - * unreserve num_items number of items worth of metadata space. This needs to - * be paired with btrfs_reserve_metadata_space. - * - * NOTE: if you have the option, run this _AFTER_ you do a - * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref - * oprations which will result in more used metadata, so we want to make sure we - * can do that without issue. - */ - int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) + void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); + } + + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) + { + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_target; - bool bug = false; - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; - num_bytes = calculate_bytes_needed(root, num_items); + btrfs_init_block_rsv(block_rsv); - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); + alloc_target = btrfs_get_alloc_profile(root, 0); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); - BUG_ON(bug); + return block_rsv; + } - return 0; + void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) + { + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } } /* - * Reserve some metadata space for use. We'll calculate the worste case number - * of bytes that would be needed to modify num_items number of items. If we - * have space, fantastic, if not, you get -ENOSPC. Please call - * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of - * items you reserved, since whatever metadata you needed should have already - * been allocated. - * - * This will commit the transaction to make more space if we don't have enough - * metadata space. THe only time we don't do this is if we're reserving space - * inside of a transaction, then we will just return -ENOSPC and it is the - * callers responsibility to handle it properly. + * make the block_rsv struct be able to capture freed space. + * the captured space will re-add to the the block_rsv struct + * after transaction commit */ - int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) + void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int retries = 0; + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); + } - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); + int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) + { + int ret; - num_bytes = calculate_bytes_needed(root, num_items); + if (num_bytes == 0) + return 0; again: - spin_lock(&meta_sinfo->lock); + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); + if (ret > 0) + goto again; + + return ret; + } - if (!retries) - meta_sinfo->bytes_may_use += num_bytes; + int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) + { + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + if (!block_rsv) + return 0; - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); - } + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; - if (retries == 2) { - flush_delalloc(root, meta_sinfo); - goto again; + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); + } - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; } - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); - return 0; + return -ENOSPC; + } + + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) + { + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + } + + void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) + { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); } /* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree */ - int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) + static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; - u64 used; - int ret = 0, committed = 0, flushed = 0; + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + #if 0 + /* + * per tree used space accounting can be inaccuracy, so we + * can't rely on it. + */ + spin_lock(&fs_info->extent_root->accounting_lock); + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); + spin_unlock(&fs_info->extent_root->accounting_lock); - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + spin_lock(&fs_info->csum_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); + spin_unlock(&fs_info->csum_root->accounting_lock); - data_sinfo = BTRFS_I(inode)->space_info; - if (!data_sinfo) - goto alloc; + spin_lock(&fs_info->tree_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); + spin_unlock(&fs_info->tree_root->accounting_lock); + #endif + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); - again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + - data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + - data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + - data_sinfo->bytes_super; + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); - if (!flushed) { - spin_unlock(&data_sinfo->lock); - flush_delalloc(root, data_sinfo); - flushed = 1; - goto again; - } + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + } - data_sinfo->force_alloc = 1; - spin_unlock(&data_sinfo->lock); - alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + static void update_global_block_rsv(struct btrfs_fs_info *fs_info) + { + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, - alloc_target, 0); - btrfs_end_transaction(trans, root); - if (ret) - return ret; + num_bytes = calc_global_metadata_size(fs_info); - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } - goto again; - } - spin_unlock(&data_sinfo->lock); + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); - /* commit the current transaction and try again */ - if (!committed && !root->fs_info->open_ioctl_trans) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; - } + block_rsv->size = num_bytes; - printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" - ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use " - "%llu total\n", (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_delalloc, - (unsigned long long)data_sinfo->bytes_used, - (unsigned long long)data_sinfo->bytes_reserved, - (unsigned long long)data_sinfo->bytes_pinned, - (unsigned long long)data_sinfo->bytes_readonly, - (unsigned long long)data_sinfo->bytes_may_use, - (unsigned long long)data_sinfo->total_bytes); - return -ENOSPC; + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; } - data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; - spin_unlock(&data_sinfo->lock); - return 0; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } + #if 0 + printk(KERN_INFO"global block rsv size %llu reserved %llu\n", + block_rsv->size, block_rsv->reserved); + #endif + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); } - /* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. - */ - void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) + static void init_global_block_rsv(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; + struct btrfs_space_info *space_info; - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; - data_sinfo = BTRFS_I(inode)->space_info; - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - spin_unlock(&data_sinfo->lock); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); } - /* called when we are adding a delalloc extent to the inode's io_tree */ - void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) + static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *data_sinfo; + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + } - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; + static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) + { + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; + } - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; + int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries) + { + u64 num_bytes; + int ret; - /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. This happens on a weird - * writepage condition, but shouldn't hurt our accounting - */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; - } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - } + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; - spin_unlock(&data_sinfo->lock); + num_bytes = calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes, retries); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; } - /* called when we are clearing an delalloc extent from the inode's io_tree */ - void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) + void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_space_info *info; + if (!trans->bytes_reserved) + return; - info = BTRFS_I(inode)->space_info; + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; + } - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); + int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * one for deleting orphan item, one for updating inode and + * two for calling btrfs_truncate_inode_items. + * + * btrfs_truncate_inode_items is a delete operation, it frees + * more space than it uses in most cases. So two units of + * metadata space should be enough for calling it many times. + * If all of the metadata space is used, we can commit + * transaction and use space it freed. + */ + u64 num_bytes = calc_trans_metadata_size(root, 4); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } - static void force_metadata_allocation(struct btrfs_fs_info *info) + void btrfs_orphan_release_metadata(struct inode *inode) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = calc_trans_metadata_size(root, 4); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); + } - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = 1; - } - rcu_read_unlock(); + int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) + { + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. + */ + u64 num_bytes = calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } - static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) + static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) { - struct btrfs_space_info *space_info; - struct btrfs_fs_info *fs_info = extent_root->fs_info; - u64 thresh; - int ret = 0; + return num_bytes >>= 3; + } - mutex_lock(&fs_info->chunk_mutex); + int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int retries = 0; + int ret; - flags = btrfs_reduce_alloc_profile(extent_root, flags); + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); - space_info = __find_space_info(extent_root->fs_info, flags); - if (!space_info) { - ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); - BUG_ON(ret); + num_bytes = ALIGN(num_bytes, root->sectorsize); + again: + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + if (nr_extents > BTRFS_I(inode)->reserved_extents) { + nr_extents -= BTRFS_I(inode)->reserved_extents; + to_reserve = calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; } - BUG_ON(!space_info); - spin_lock(&space_info->lock); - if (space_info->force_alloc) - force = 1; - if (space_info->full) { - spin_unlock(&space_info->lock); - goto out; + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(block_rsv, to_reserve); + if (ret) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, + &retries); + if (ret > 0) + goto again; + return ret; } - thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 8); - if (!force && - (space_info->bytes_used + space_info->bytes_pinned + - space_info->bytes_reserved + alloc_bytes) < thresh) { - spin_unlock(&space_info->lock); - goto out; - } - spin_unlock(&space_info->lock); + BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_unlock(&BTRFS_I(inode)->accounting_lock); - /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. - */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve); + + return 0; + } + + void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents < BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; + BTRFS_I(inode)->reserved_extents -= nr_extents; + } else { + nr_extents = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = btrfs_alloc_chunk(trans, extent_root, flags); - spin_lock(&space_info->lock); + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); + } + + int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) + { + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); if (ret) - space_info->full = 1; - space_info->force_alloc = 0; - spin_unlock(&space_info->lock); - out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); - return ret; + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; + } + + void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) + { + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); } static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free) + u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; + int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@@ -3486,6 -3798,12 +3798,12 @@@ cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@@ -3498,31 -3816,24 +3816,24 @@@ old_val += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; cache->space_info->bytes_reserved -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - cache->space_info->bytes_used -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - if (mark_free) { - int ret; - ret = btrfs_discard_extent(root, bytenr, - num_bytes); - WARN_ON(ret); - - ret = btrfs_add_free_space(cache, bytenr, - num_bytes); - WARN_ON(ret); - } + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@@ -3546,18 -3857,10 +3857,10 @@@ static u64 first_logical_byte(struct bt return bytenr; } - /* - * this function must be called within transaction - */ - int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) + static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@@ -3569,28 -3872,68 +3872,68 @@@ spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; + } + + /* + * this function must be called within transaction + */ + int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) + { + struct btrfs_block_group_cache *cache; - set_extent_dirty(fs_info->pinned_extents, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); + + btrfs_put_block_group(cache); return 0; } - static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + /* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. + */ + static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += num_bytes; - cache->space_info->bytes_reserved += num_bytes; + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); } else { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - return 0; + return ret; } int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@@ -3621,6 -3964,8 +3964,8 @@@ fs_info->pinned_extents = &fs_info->freed_extents[0]; up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); return 0; } @@@ -3647,14 -3992,21 +3992,21 @@@ static int unpin_extent_range(struct bt btrfs_add_free_space(cache, start, len); } + start += len; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - - start += len; } if (cache) @@@ -3667,8 -4019,11 +4019,11 @@@ int btrfs_finish_extent_commit(struct b { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; u64 start; u64 end; + int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@@ -3689,59 -4044,30 +4044,30 @@@ cond_resched(); } - return ret; - } - - static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean) - { - int err = 0; - struct extent_buffer *buf; - - if (is_data) - goto pinit; - - /* - * discard is sloooow, and so triggering discards on - * individual btree blocks isn't a good plan. Just - * pin everything in discard mode. - */ - if (btrfs_test_opt(root, DISCARD)) - goto pinit; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { - buf = btrfs_find_tree_block(root, bytenr, num_bytes); - if (!buf) - goto pinit; + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); - /* we can reuse a block if it hasn't been written - * and it is from this transaction. We can't - * reuse anything from the tree log root because - * it has tiny sub-transactions. - */ - if (btrfs_buffer_uptodate(buf, 0) && - btrfs_try_tree_lock(buf)) { - u64 header_owner = btrfs_header_owner(buf); - u64 header_transid = btrfs_header_generation(buf); - if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_transid == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; - return 1; + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); } - btrfs_tree_unlock(buf); } - free_extent_buffer(buf); - pinit: - if (path) - btrfs_set_path_blocking(path); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, reserved); + mutex_unlock(&fs_info->durable_block_rsv_mutex); - BUG_ON(err < 0); return 0; } @@@ -3902,9 -4228,6 +4228,6 @@@ static int __btrfs_free_extent(struct b BUG_ON(ret); } } else { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - if (found_extent) { BUG_ON(is_data && refs_to_drop != extent_data_ref_count(root, path, iref)); @@@ -3917,31 -4240,11 +4240,11 @@@ } } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, 0, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); @@@ -3951,8 -4254,7 +4254,7 @@@ (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } - ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); BUG_ON(ret); } btrfs_free_path(path); @@@ -3960,7 -4262,7 +4262,7 @@@ } /* - * when we free an extent, it is possible (and likely) that we free the last + * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. @@@ -3972,7 -4274,7 +4274,7 @@@ static noinline int check_ref_cleanup(s struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct rb_node *node; - int ret; + int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@@ -4024,17 -4326,99 +4326,99 @@@ list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); - BUG_ON(ret); + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - return 0; + return ret; out: spin_unlock(&delayed_refs->lock); return 0; } + void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) + { + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + BUG_ON(block_rsv->space_info != cache->space_info); + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + spin_unlock(&cache->space_info->lock); + } + goto out; + } + pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } + out: + btrfs_put_block_group(cache); + } + int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@@ -4056,8 -4440,6 +4440,6 @@@ parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, @@@ -4067,21 -4449,6 +4449,6 @@@ return ret; } - int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level) - { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) - blocksize; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - - return btrfs_free_extent(trans, root, bytenr, blocksize, - parent, root_objectid, level, 0); - } - static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@@ -4134,6 -4501,22 +4501,22 @@@ wait_block_group_cache_done(struct btrf return 0; } + static int get_block_group_index(struct btrfs_block_group_cache *cache) + { + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; + } + enum btrfs_loop_type { LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, @@@ -4155,7 -4538,6 +4538,6 @@@ static noinline int find_free_extent(st u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 exclude_start, u64 exclude_nr, int data) { int ret = 0; @@@ -4168,6 -4550,7 +4550,7 @@@ struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + int index = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@@ -4237,6 -4620,7 +4620,7 @@@ ideal_cache btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { + index = get_block_group_index(block_group); goto have_block_group; } } else if (block_group) { @@@ -4245,7 -4629,8 +4629,8 @@@ } search: down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups, list) { + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { u64 offset; int cached; @@@ -4436,23 -4821,22 +4821,22 @@@ checks goto loop; } - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + ret = update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= block_group->key.objectid && - search_start < (block_group->key.objectid + - block_group->key.offset)) - goto have_block_group; goto loop; } + /* we are all good, lets return */ ins->objectid = search_start; ins->offset = num_bytes; @@@ -4460,18 -4844,18 +4844,18 @@@ btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - - update_reserved_extents(block_group, num_bytes, 1); - - /* we are all good, lets return */ break; loop: failed_cluster_refill = false; failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for * for them to make caching progress. Also * determine the best possible bg to cache @@@ -4485,6 -4869,7 +4869,7 @@@ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { + index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; @@@ -4567,31 -4952,30 +4952,30 @@@ static void dump_space_info(struct btrf int dump_block_groups) { struct btrfs_block_group_cache *cache; + int index = 0; spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - - info->bytes_super), + info->bytes_readonly), (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" - "\n", + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_reserved, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_root, - (unsigned long long)info->bytes_super, - (unsigned long long)info->bytes_reserved); + (unsigned long long)info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); - list_for_each_entry(cache, &info->block_groups, list) { + again: + list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@@ -4603,6 -4987,8 +4987,8 @@@ btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; up_read(&info->groups_sem); } @@@ -4628,9 -5014,8 +5014,8 @@@ again WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, ins, - trans->alloc_exclude_start, - trans->alloc_exclude_nr, data); + search_start, search_end, hint_byte, + ins, data); if (ret == -ENOSPC && num_bytes > min_alloc_size) { num_bytes = num_bytes >> 1; @@@ -4668,7 -5053,7 +5053,7 @@@ int btrfs_free_reserved_extent(struct b ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - update_reserved_extents(cache, len, 0); + update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); return ret; @@@ -4731,8 -5116,7 +5116,7 @@@ static int alloc_reserved_file_extent(s btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@@ -4792,8 -5176,7 +5176,7 @@@ static int alloc_reserved_tree_block(st btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@@ -4869,73 -5252,14 +5252,14 @@@ int btrfs_alloc_logged_file_extent(stru put_caching_control(caching_ctl); } - update_reserved_extents(block_group, ins->offset, 1); + ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); return ret; } - /* - * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns 0 if everything worked, non-zero otherwise. - */ - static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) - { - int ret; - u64 flags = 0; - - ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); - if (ret) - return ret; - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); - BUG_ON(ret); - } - - if (root_objectid == root->root_key.objectid) { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) + num_bytes; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - } - return ret; - } - struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@@ -4974,8 -5298,45 +5298,45 @@@ return buf; } + static struct btrfs_block_rsv * + use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) + { + struct btrfs_block_rsv *block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(block_rsv, blocksize); + if (ret) + return ERR_PTR(ret); + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return ERR_PTR(-ENOSPC); + } + + static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) + { + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); + } + /* - * helper function to allocate a block for a given tree + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, @@@ -4985,18 -5346,53 +5346,53 @@@ u64 hint, u64 empty_size) { struct btrfs_key ins; - int ret; + struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + u64 flags = 0; + int ret; + - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { - BUG_ON(ret > 0); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } return buf; } @@@ -5321,7 -5717,7 +5717,7 @@@ static noinline int walk_up_proc(struc struct btrfs_path *path, struct walk_control *wc) { - int ret = 0; + int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; @@@ -5399,13 -5795,11 +5795,11 @@@ btrfs_header_owner(path->nodes[level + 1])); } - ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, - root->root_key.objectid, level, 0); - BUG_ON(ret); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; - return ret; + return 0; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@@ -5483,7 -5877,8 +5877,8 @@@ static noinline int walk_up_tree(struc * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ - int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) + int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@@ -5501,7 -5896,9 +5896,9 @@@ wc = kzalloc(sizeof(*wc), GFP_NOFS); BUG_ON(!wc); - trans = btrfs_start_transaction(tree_root, 1); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); @@@ -5589,22 -5986,16 +5986,16 @@@ } BUG_ON(wc->level == 0); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (btrfs_should_end_transaction(trans, tree_root)) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); BUG_ON(ret); - btrfs_end_transaction(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 1); - } else { - unsigned long update; - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, tree_root, - update); + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; } } btrfs_release_path(root, path); @@@ -5632,7 -6023,7 +6023,7 @@@ kfree(root); } out: - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); return err; @@@ -7228,48 -7619,80 +7619,80 @@@ static u64 update_block_group_flags(str return flags; } - static int __alloc_chunk_for_shrink(struct btrfs_root *root, - struct btrfs_block_group_cache *shrink_block_group, - int force) + static int set_block_group_ro(struct btrfs_block_group_cache *cache) { - struct btrfs_trans_handle *trans; - u64 new_alloc_flags; - u64 calc; + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; - spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { - spin_unlock(&shrink_block_group->lock); + if (cache->ro) + return 0; - trans = btrfs_start_transaction(root, 1); - spin_lock(&shrink_block_group->lock); + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; + } - new_alloc_flags = update_block_group_flags(root, - shrink_block_group->flags); - if (new_alloc_flags != shrink_block_group->flags) { - calc = - btrfs_block_group_used(&shrink_block_group->item); - } else { - calc = shrink_block_group->key.offset; - } - spin_unlock(&shrink_block_group->lock); + int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) - do_chunk_alloc(trans, root->fs_info->extent_root, - calc + 2 * 1024 * 1024, new_alloc_flags, force); + { + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; - btrfs_end_transaction(trans, root); - } else - spin_unlock(&shrink_block_group->lock); - return 0; - } + BUG_ON(cache->ro); + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); - int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); + out: + btrfs_end_transaction(trans, root); + return ret; + } + int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); return 0; } @@@ -7436,17 -7859,33 +7859,33 @@@ int btrfs_free_block_groups(struct btrf */ synchronize_rcu(); + release_global_block_rsv(info); + while(!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } list_del(&space_info->list); kfree(space_info); } return 0; } + static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) + { + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + } + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@@ -7468,10 -7907,8 +7907,8 @@@ while (1) { ret = find_first_block_group(root, path, &key); - if (ret > 0) { - ret = 0; - goto error; - } + if (ret > 0) + break; if (ret != 0) goto error; @@@ -7480,7 -7917,7 +7917,7 @@@ cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) { ret = -ENOMEM; - break; + goto error; } atomic_set(&cache->count, 1); @@@ -7537,20 -7974,36 +7974,36 @@@ BUG_ON(ret); cache->space_info = space_info; spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups); - up_write(&space_info->groups_sem); + __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_readonly(cache); + set_block_group_ro(cache); + } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); } + + init_global_block_rsv(info); ret = 0; error: btrfs_free_path(path); @@@ -7611,12 -8064,10 +8064,10 @@@ int btrfs_make_block_group(struct btrfs BUG_ON(ret); spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&cache->space_info->groups_sem); - list_add_tail(&cache->list, &cache->space_info->block_groups); - up_write(&cache->space_info->groups_sem); + __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); diff --combined fs/btrfs/inode.c index d601629b85d1,2551b8018399..fa6ccc1bfe2a --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@@ -252,6 -252,7 +252,7 @@@ static noinline int cow_file_range_inli inline_len, compressed_size, compressed_pages); BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@@ -414,6 -415,7 +415,7 @@@ again trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { @@@ -439,7 -441,6 +441,6 @@@ start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); btrfs_end_transaction(trans, root); @@@ -697,6 -698,38 +698,38 @@@ retry return 0; } + static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) + { + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; + } + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@@ -734,6 -767,7 +767,7 @@@ static noinline int cow_file_range(stru trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; actual_end = min_t(u64, isize, end + 1); @@@ -753,7 -787,6 +787,6 @@@ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@@ -769,29 -802,7 +802,7 @@@ BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@@ -1174,6 -1185,13 +1185,13 @@@ out_check num_bytes, num_bytes, type); BUG_ON(ret); + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | @@@ -1226,15 -1244,13 +1244,13 @@@ static int run_delalloc_range(struct in } static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) + struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_inc(&BTRFS_I(inode)->outstanding_extents); return 0; } @@@ -1252,10 -1268,7 +1268,7 @@@ static int btrfs_merge_extent_hook(stru if (!(other->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_dec(&BTRFS_I(inode)->outstanding_extents); return 0; } @@@ -1264,8 -1277,8 +1277,8 @@@ * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ - static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) + static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@@ -1273,17 -1286,18 +1286,18 @@@ * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@@ -1297,45 -1311,32 +1311,32 @@@ * extent_io.c clear_bit_hook, see set_bit_hook for why */ static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - } + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_del_init(&BTRFS_I(inode)->delalloc_inodes); @@@ -1384,7 -1385,8 +1385,8 @@@ int btrfs_merge_bio_hook(struct page *p */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@@ -1403,7 -1405,8 +1405,8 @@@ * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; return btrfs_map_bio(root, rw, bio, mirror_num, 1); @@@ -1414,7 -1417,8 +1417,8 @@@ * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@@ -1439,7 -1443,8 +1443,8 @@@ /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); } @@@ -1520,6 -1525,7 +1525,7 @@@ again goto again; } + BUG(); btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: @@@ -1650,7 -1656,7 +1656,7 @@@ static int insert_reserved_file_extent( static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; @@@ -1668,9 -1674,10 +1674,10 @@@ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); } goto out; } @@@ -1680,6 -1687,8 +1687,8 @@@ 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; @@@ -1711,12 -1720,13 +1720,13 @@@ add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@@ -1838,7 -1848,7 +1848,7 @@@ static int btrfs_io_failed_hook(struct BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, - failrec->bio_flags); + failrec->bio_flags, 0); return 0; } @@@ -1992,33 -2002,197 +2002,197 @@@ void btrfs_run_delayed_iputs(struct btr up_read(&root->fs_info->cleanup_work_sem); } + /* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ + void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) + { + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; + } + + void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) + { + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + + #if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; + #endif + } + + enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, + }; + + /* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ + void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } + } + /* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; + + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } - /* already on the orphan list, we're good */ - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + #if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; + #endif + insert = 1; + } else { + WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); } - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* - * insert an orphan item to track this unlinked/truncated file - */ - ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } - return ret; + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; } /* @@@ -2028,26 -2202,31 +2202,31 @@@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; int ret = 0; - spin_lock(&root->list_lock); - - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; } - list_del_init(&BTRFS_I(inode)->i_orphan); - if (!trans) { - spin_unlock(&root->list_lock); - return 0; + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } - ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + if (release_rsv) + btrfs_orphan_release_metadata(inode); - return ret; + return 0; } /* @@@ -2064,7 -2243,7 +2243,7 @@@ void btrfs_orphan_cleanup(struct btrfs_ struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - if (!xchg(&root->clean_orphans, 0)) + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return; path = btrfs_alloc_path(); @@@ -2117,16 -2296,15 +2296,15 @@@ found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) - break; + BUG_ON(IS_ERR(inode)); /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); /* * if this is a bad inode, means we actually succeeded in @@@ -2135,7 -2313,7 +2313,7 @@@ * do a destroy_inode */ if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@@ -2153,13 -2331,23 +2331,23 @@@ /* this will do delete_inode and everything for us */ iput(inode); } + btrfs_free_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root, 1); + btrfs_end_transaction(trans, root); + } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); - - btrfs_free_path(path); } /* @@@ -2478,103 -2666,276 +2666,276 @@@ out return ret; } - static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + /* helper to check if there is any shared block in the path */ + static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) { - struct btrfs_root *root; - struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; + struct extent_buffer *eb; + int level; int ret; - unsigned long nr = 0; - - root = BTRFS_I(dir)->root; - - /* - * 5 items for unlink inode - * 1 for orphan - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; + u64 refs; - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 6); - return PTR_ERR(trans); + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; + ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; } - - btrfs_set_trans_block_group(trans, dir); - - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); - - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, - dentry->d_name.name, dentry->d_name.len); - - if (inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, inode); - - nr = trans->blocks_used; - - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 6); - btrfs_btree_balance_dirty(root, nr); - return ret; + return 0; } - int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len) + /* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ + static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; - struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; struct btrfs_dir_item *di; - struct btrfs_key key; + struct inode *inode = dentry->d_inode; u64 index; + int check_link = 1; + int err = -ENOSPC; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, - name, name_len, -1); - BUG_ON(!di || IS_ERR(di)); + if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); - WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); - ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, - objectid, root->root_key.objectid, - dir->i_ino, &index, name, name_len); - if (ret < 0) { - BUG_ON(ret != -ENOENT); - di = btrfs_search_dir_index_item(root, path, dir->i_ino, - name, name_len); - BUG_ON(!di || IS_ERR(di)); + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(root, path); - index = key.offset; + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); + + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); } - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - index, name, name_len, -1); - BUG_ON(!di || IS_ERR(di)); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); - WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); - btrfs_release_path(root, path); + path->skip_locking = 1; + path->search_commit_root = 1; - btrfs_i_size_write(dir, dir->i_size - name_len * 2); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = btrfs_lookup_file_extent(trans, root, path, + inode->i_ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(root, path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; + } + btrfs_release_path(root, path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + inode->i_ino, dir->i_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; + out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; + } + + static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) + { + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); + } + + static int btrfs_unlink(struct inode *dir, struct dentry *dentry) + { + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_set_trans_block_group(trans, dir); + + btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); + + if (inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } + + nr = trans->blocks_used; + __unlink_end_trans(trans, root); + btrfs_btree_balance_dirty(root, nr); + return ret; + } + + int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) + { + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir->i_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir->i_ino, + name, name_len); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + index = key.offset; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); dir->i_sb->s_dirt = 1; @@@ -2587,7 -2948,6 +2948,6 @@@ static int btrfs_rmdir(struct inode *di { struct inode *inode = dentry->d_inode; int err = 0; - int ret; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; unsigned long nr = 0; @@@ -2596,15 -2956,9 +2956,9 @@@ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - ret = btrfs_reserve_metadata_space(root, 5); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 5); + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) return PTR_ERR(trans); - } btrfs_set_trans_block_group(trans, dir); @@@ -2627,12 -2981,9 +2981,9 @@@ btrfs_i_size_write(inode, 0); out: nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 5); + __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); - if (ret && !err) - err = ret; return err; } @@@ -3029,6 -3380,7 +3380,7 @@@ out if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + BUG_ON(ret); } btrfs_free_path(path); return err; @@@ -3056,11 -3408,7 +3408,7 @@@ static int btrfs_truncate_page(struct a if ((offset & (blocksize - 1)) == 0) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; @@@ -3068,8 -3416,7 +3416,7 @@@ again: page = grab_cache_page(mapping, index); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; } @@@ -3132,8 -3479,7 +3479,7 @@@ out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@@ -3145,7 -3491,7 +3491,7 @@@ int btrfs_cont_expand(struct inode *ino struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; u64 hole_start = (inode->i_size + mask) & ~mask; @@@ -3183,11 -3529,11 +3529,11 @@@ u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_reserve_metadata_space(root, 2); - if (err) + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); break; - - trans = btrfs_start_transaction(root, 1); + } btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, @@@ -3205,14 -3551,15 +3551,15 @@@ last_byte - 1, 0); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); + em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } + free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); return err; @@@ -3239,11 -3586,10 +3586,10 @@@ static int btrfs_setattr_size(struct in } } - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_orphan_add(trans, inode); @@@ -3251,7 -3597,6 +3597,6 @@@ nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 1); btrfs_btree_balance_dirty(root, nr); if (attr->ia_size > inode->i_size) { @@@ -3264,8 -3609,11 +3609,11 @@@ i_size_write(inode, attr->ia_size); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + BUG_ON(!trans->block_rsv); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@@ -3345,10 -3693,21 +3693,21 @@@ void btrfs_delete_inode(struct inode *i btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@@ -3356,6 -3715,7 +3715,7 @@@ btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); + } if (ret == 0) { @@@ -3596,40 -3956,10 +3956,10 @@@ again return 0; } - static noinline void init_btrfs_i(struct inode *inode) - { - struct btrfs_inode *bi = BTRFS_I(inode); - - bi->generation = 0; - bi->sequence = 0; - bi->last_trans = 0; - bi->last_sub_trans = 0; - bi->logged_trans = 0; - bi->delalloc_bytes = 0; - bi->reserved_bytes = 0; - bi->disk_i_size = 0; - bi->flags = 0; - bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; - bi->ordered_data_close = 0; - bi->force_compress = 0; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->log_mutex); - } - static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; - init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; btrfs_set_inode_space_info(args->root, inode); return 0; @@@ -3692,8 -4022,6 +4022,6 @@@ static struct inode *new_simple_dir(str if (!inode) return ERR_PTR(-ENOMEM); - init_btrfs_i(inode); - BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); BTRFS_I(inode)->dummy_inode = 1; @@@ -3950,7 -4278,7 +4278,7 @@@ int btrfs_write_inode(struct inode *ino struct btrfs_trans_handle *trans; int ret = 0; - if (root->fs_info->btree_inode == inode) + if (BTRFS_I(inode)->dummy_inode) return 0; if (wbc->sync_mode == WB_SYNC_ALL) { @@@ -3971,10 -4299,38 +4299,38 @@@ void btrfs_dirty_inode(struct inode *in { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %ld\n", + inode->i_ino, PTR_ERR(trans)); + } + return; + } + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %d\n", + inode->i_ino, ret); + } + } + } btrfs_end_transaction(trans, root); } @@@ -4092,7 -4448,6 +4448,6 @@@ static struct inode *btrfs_new_inode(st * btrfs_get_inode_index_count has an explanation for the magic * number */ - init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; @@@ -4121,7 -4476,16 +4476,7 @@@ if (ret != 0) goto fail; - inode->i_uid = current_fsuid(); - - if (dir && (dir->i_mode & S_ISGID)) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = objectid; inode_set_bytes(inode, 0); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@@ -4247,26 -4611,21 +4602,21 @@@ static int btrfs_mknod(struct inode *di if (!new_valid_dev(rdev)) return -EINVAL; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@@ -4295,13 -4654,11 +4645,11 @@@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - fail: - btrfs_unreserve_metadata_space(root, 5); + btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); return err; } @@@ -4311,32 -4668,26 +4659,26 @@@ static int btrfs_create(struct inode *d struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int err; int drop_inode = 0; + int err; unsigned long nr = 0; u64 objectid; u64 index = 0; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, @@@ -4368,8 -4719,6 +4710,6 @@@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@@ -4396,21 -4745,21 +4736,21 @@@ static int btrfs_link(struct dentry *ol if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; - /* - * 1 item for inode ref - * 2 items for dir items - */ - err = btrfs_reserve_metadata_space(root, 3); - if (err) - return err; - btrfs_inc_nlink(inode); err = btrfs_set_inode_index(dir, &index); if (err) goto fail; - trans = btrfs_start_transaction(root, 1); + /* + * 1 item for inode ref + * 2 items for dir items + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } btrfs_set_trans_block_group(trans, dir); atomic_inc(&inode->i_count); @@@ -4429,7 -4778,6 +4769,6 @@@ nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: - btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@@ -4449,28 -4797,20 +4788,20 @@@ static int btrfs_mkdir(struct inode *di u64 index = 0; unsigned long nr = 1; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_fail; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@@ -4510,9 -4850,6 +4841,6 @@@ out_fail: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - - out_unlock: - btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@@ -4770,6 -5107,7 +5098,7 @@@ again } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@@ -4866,11 -5204,651 +5195,651 @@@ out return em; } + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_join_transaction(root, 0); + if (!trans) + return ERR_PTR(-ENOMEM); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } + out: + btrfs_end_transaction(trans, root); + return em; + } + + /* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ + static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) + { + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; + out: + btrfs_free_path(path); + return ret; + } + + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root, 0); + if (!trans) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } + must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); + unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); + map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; + } + + struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; + }; + + static void btrfs_endio_direct_read(struct bio *bio, int err) + { + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); + } + + static void btrfs_endio_direct_write(struct bio *bio, int err) + { + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); + out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); + out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); + } + + static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) + { + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; + } + + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, 0, 0, + dip->logical_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + if (ret) + goto out_err; + return; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); + if (ret) + goto out_err; + return; + out_err: + kfree(dip->csums); + kfree(dip); + free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); + } + + static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) + { + int seg; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + retval = 0; + out: + return retval; + } static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } + out: + free_extent_state(cached_state); + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@@ -5034,7 -6012,7 +6003,7 @@@ int btrfs_page_mkwrite(struct vm_area_s u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@@ -5043,13 -6021,6 +6012,6 @@@ goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@@ -5059,7 -6030,6 +6021,6 @@@ if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@@ -5100,7 -6070,6 +6061,6 @@@ unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@@ -5127,10 -6096,10 +6087,10 @@@ unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: return ret; } @@@ -5155,8 -6124,10 +6115,10 @@@ static void btrfs_truncate(struct inod btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; /* * setattr is responsible for setting the ordered_data_close flag, @@@ -5179,6 -6150,23 +6141,23 @@@ btrfs_add_ordered_operation(trans, root, inode); while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + } + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + trans = NULL; + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@@ -5190,10 -6178,8 +6169,8 @@@ nr = trans->blocks_used; btrfs_end_transaction(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root, nr); - - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); } if (ret == 0 && inode->i_nlink > 0) { @@@ -5254,21 -6240,47 +6231,47 @@@ unsigned long btrfs_force_ra(struct add struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; + struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; - ei->outstanding_extents = 0; - ei->reserved_extents = 0; - ei->root = NULL; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + spin_lock_init(&ei->accounting_lock); + atomic_set(&ei->outstanding_extents, 0); + ei->reserved_extents = 0; + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->force_compress = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree, GFP_NOFS); + extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); + mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); - return &ei->vfs_inode; + RB_CLEAR_NODE(&ei->rb_node); + + return inode; } void btrfs_destroy_inode(struct inode *inode) @@@ -5278,6 -6290,8 +6281,8 @@@ WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@@ -5298,13 -6312,13 +6303,13 @@@ spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", inode->i_ino); list_del_init(&BTRFS_I(inode)->i_orphan); } - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@@ -5425,19 -6439,6 +6430,6 @@@ static int btrfs_rename(struct inode *o if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - - /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - ret = btrfs_reserve_metadata_space(root, 11); - if (ret) - return ret; - /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so @@@ -5450,8 -6451,18 +6442,18 @@@ /* close the racy window with snapshot create/destroy ioctl */ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, new_dir); if (dest != root) @@@ -5550,7 -6561,6 +6552,6 @@@ out_fail if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 11); return ret; } @@@ -5602,6 -6612,38 +6603,38 @@@ int btrfs_start_delalloc_inodes(struct return 0; } + int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) + { + struct btrfs_inode *binode; + struct inode *inode = NULL; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(&root->fs_info->delalloc_inodes)) { + binode = list_entry(root->fs_info->delalloc_inodes.next, + struct btrfs_inode, delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (inode) { + list_move_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); + break; + } + + list_del_init(&binode->delalloc_inodes); + cond_resched_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + if (inode) { + write_inode_now(inode, 0); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + return 1; + } + return 0; + } + static int btrfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@@ -5625,26 -6667,20 +6658,20 @@@ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 items for inode item and ref * 2 items for dir items * 1 item for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto out_fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@@ -5716,8 -6752,6 +6743,6 @@@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - out_fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@@ -5726,33 -6760,28 +6751,28 @@@ return err; } - static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode, loff_t actual_len) + int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; - u64 num_bytes = end - start; int ret = 0; - u64 i_size; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 1); - - ret = btrfs_reserve_extent(trans, root, num_bytes, - root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - if (ret) { - WARN_ON(1); - goto stop_trans; + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; } - ret = btrfs_reserve_metadata_space(root, 3); + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); - goto stop_trans; + btrfs_end_transaction(trans, root); + break; } ret = insert_reserved_file_extent(trans, inode, @@@ -5766,34 -6795,27 +6786,27 @@@ num_bytes -= ins.offset; cur_offset += ins.offset; - alloc_hint = ins.objectid + ins.offset; + *alloc_hint = ins.objectid + ins.offset; inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size = actual_len; + i_size_write(inode, actual_len); else - i_size = cur_offset; - i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + i_size_write(inode, cur_offset); + i_size_write(inode, cur_offset); + btrfs_ordered_update_i_size(inode, cur_offset, NULL); } ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 3); } return ret; - - stop_trans: - btrfs_end_transaction(trans, root); - return ret; - } static long btrfs_fallocate(struct inode *inode, int mode, @@@ -5826,8 -6848,7 +6839,7 @@@ goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) goto out; @@@ -5872,16 -6893,16 +6884,16 @@@ if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = prealloc_file_range(inode, - cur_offset, last_byte, - alloc_hint, mode, offset+len); + ret = btrfs_prealloc_file_range(inode, 0, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); if (ret < 0) { free_extent_map(em); break; } } - if (em->block_start <= EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; free_extent_map(em); cur_offset = last_byte; @@@ -5893,8 -6914,7 +6905,7 @@@ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --combined fs/btrfs/super.c index 2909a03e5230,574285c8cbd4..d34b2dfc9628 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@@ -498,7 -498,7 +498,7 @@@ int btrfs_sync_fs(struct super_block *s btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_commit_transaction(trans, root); return ret; } @@@ -694,11 -694,11 +694,11 @@@ static int btrfs_remount(struct super_b if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); - ret = btrfs_cleanup_fs_roots(root->fs_info); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); sb->s_flags &= ~MS_RDONLY; @@@ -714,34 -714,18 +714,18 @@@ static int btrfs_statfs(struct dentry * struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 data_used = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_DUP| - BTRFS_BLOCK_GROUP_RAID10| - BTRFS_BLOCK_GROUP_RAID1)) { - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } - - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } + list_for_each_entry_rcu(found, head, list) + total_used += found->disk_used; rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (data_used >> bits); + buf->f_bavail = buf->f_bfree; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; @@@ -832,14 -816,11 +816,14 @@@ static const struct file_operations btr }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); diff --combined fs/btrfs/xattr.c index 59acd3eb288a,007fae581a04..88ecbb215878 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@@ -154,15 -154,10 +154,10 @@@ int __btrfs_setxattr(struct btrfs_trans if (trans) return do_setxattr(trans, inode, name, value, size, flags); - ret = btrfs_reserve_metadata_space(root, 2); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } btrfs_set_trans_block_group(trans, inode); ret = do_setxattr(trans, inode, name, value, size, flags); @@@ -174,7 -169,6 +169,6 @@@ BUG_ON(ret); out: btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 2); return ret; } @@@ -282,7 -276,7 +276,7 @@@ err * List of handlers for synthetic system.* attributes. All real ondisk * attributes are handled directly. */ -struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, diff --combined include/linux/fs.h index 9682d52d1507,10704f0086c8..85e823adcd4a --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -651,7 -651,6 +651,7 @@@ struct block_device int bd_openers; struct mutex bd_mutex; /* open/close mutex */ struct list_head bd_inodes; + void * bd_claiming; void * bd_holder; int bd_holders; #ifdef CONFIG_SYSFS @@@ -1281,12 -1280,10 +1281,12 @@@ static inline int lock_may_write(struc struct fasync_struct { - int magic; - int fa_fd; - struct fasync_struct *fa_next; /* singly linked list */ - struct file *fa_file; + spinlock_t fa_lock; + int magic; + int fa_fd; + struct fasync_struct *fa_next; /* singly linked list */ + struct file *fa_file; + struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 @@@ -1295,6 -1292,8 +1295,6 @@@ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); -/* only for net: no internal synchronization */ -extern void __kill_fasync(struct fasync_struct *, int, int); extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, unsigned long arg, int force); @@@ -1315,6 -1314,8 +1315,6 @@@ extern int send_sigurg(struct fown_stru extern struct list_head super_blocks; extern spinlock_t sb_lock; -#define sb_entry(list) list_entry((list), struct super_block, s_list) -#define S_BIAS (1<<30) struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@@ -1333,11 -1334,12 +1333,11 @@@ struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_need_sync; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif - struct xattr_handler **s_xattr; + const struct xattr_handler **s_xattr; struct list_head s_inodes; /* all inodes */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ @@@ -1429,8 -1431,7 +1429,8 @@@ extern void dentry_unhash(struct dentr * VFS file helper functions. */ extern int file_permission(struct file *, int); - +extern void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode); /* * VFS FS_IOC_FIEMAP helper definitions. */ @@@ -1743,7 -1744,6 +1743,7 @@@ struct file_system_type struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; + struct lock_class_key s_vfs_rename_key; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; @@@ -1781,6 -1781,8 +1781,6 @@@ extern int get_sb_pseudo(struct file_sy const struct super_operations *ops, unsigned long, struct vfsmount *mnt); extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); -int __put_super_and_need_restart(struct super_block *sb); -void put_super(struct super_block *sb); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ @@@ -1800,8 -1802,6 +1800,8 @@@ extern void drop_collected_mounts(struc extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int freeze_super(struct super_block *super); +extern int thaw_super(struct super_block *super); extern int current_umask(void); @@@ -2087,9 -2087,9 +2087,9 @@@ extern int __filemap_fdatawrite_range(s extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); -extern int vfs_fsync_range(struct file *file, struct dentry *dentry, - loff_t start, loff_t end, int datasync); -extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, + int datasync); +extern int vfs_fsync(struct file *file, int datasync); extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); extern void sync_supers(void); extern void emergency_sync(void); @@@ -2228,7 -2228,6 +2228,7 @@@ extern long do_splice_direct(struct fil extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, @@@ -2251,10 -2250,15 +2251,15 @@@ static inline int xip_truncate_page(str #endif #ifdef CONFIG_BLOCK + struct bio; + typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset); + void dio_end_io(struct bio *bio, int error); + ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); + dio_submit_t submit_io, int lock_type); enum { /* need locking between buffered and direct access */ @@@ -2270,7 -2274,7 +2275,7 @@@ static inline ssize_t blockdev_direct_I dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, + nr_segs, get_block, end_io, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } @@@ -2280,7 -2284,7 +2285,7 @@@ static inline ssize_t blockdev_direct_I dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, 0); + nr_segs, get_block, end_io, NULL, 0); } #endif @@@ -2330,7 -2334,6 +2335,7 @@@ extern struct super_block *get_super(st extern struct super_block *get_active_super(struct block_device *bdev); extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); +extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); @@@ -2364,8 -2367,6 +2369,8 @@@ extern void simple_release_fs(struct vf extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); +extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count); extern int simple_fsync(struct file *, struct dentry *, int); diff --combined mm/filemap.c index 35e12d186566,829ac9cdbd70..45a2d18df849 --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -441,7 -441,7 +441,7 @@@ int add_to_page_cache_lru(struct page * /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. */ if (mapping_cap_swap_backed(mapping)) @@@ -452,7 -452,7 +452,7 @@@ if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } @@@ -461,15 -461,9 +461,15 @@@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } @@@ -1105,12 -1099,6 +1105,12 @@@ page_not_up_to_date_locked } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); @@@ -1275,7 -1263,7 +1275,7 @@@ generic_file_aio_read(struct kiocb *ioc { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@@ -1302,21 -1290,47 +1302,47 @@@ retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. + */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0;