kfree(ctl);
}
+ #ifdef CONFIG_BTRFS_DEBUG
+ static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+ {
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+ }
+ #endif
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
u64 last = 0;
u32 nritems;
int ret = -ENOMEM;
+ bool wakeup = true;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ #ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+ #endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
- caching_ctl->progress = (u64)-1;
-
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
+ #ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
+
+ spin_lock(&block_group->space_info->lock);
+ spin_lock(&block_group->lock);
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
+ }
+ #endif
+
+ caching_ctl->progress = (u64)-1;
err:
btrfs_free_path(path);
up_read(&fs_info->commit_root_sem);
}
}
spin_unlock(&cache->lock);
+ #ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+ #endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
}
spin_unlock(&block_group->lock);
+ /*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
/*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
found->bytes_readonly = 0;
found->bytes_may_use = 0;
found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Use new btrfs_qgroup_reserve_data to reserve precious data space
+ *
+ * TODO: Find a good method to avoid reserve data space for NOCOW
+ * range, but don't impact performance on quota disable case.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
* o add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+ /*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+ static struct btrfs_free_cluster *
+ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+ {
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = 2 * 1024 * 1024;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = 64 * 1024;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+ }
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
free_excluded_extents(root, cache);
+ #ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+ #endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ /*
+ * Don't forget to free the reserved space, as for inlined extent
+ * it won't count as data extent, free them directly here.
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+ btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space_noquota(inode,
+ state->start, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
root->fs_info->delalloc_batch);
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, &ins);
+ if (ret < 0)
+ goto out;
+ /*
+ * Release the reserved range from inode dirty range map, and
+ * move it to delayed ref codes, as now accounting only happens at
+ * commit_transaction() time.
+ */
+ btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+ ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+ root->objectid, disk_bytenr, ram_bytes);
out:
btrfs_free_path(path);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ /*
+ * For mwrite(mmap + memset to write) case, we still reserve
+ * space for NOCOW range.
+ * As NOCOW won't cause a new delayed ref, just free the space
+ */
+ btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode,
+ round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode,
+ round_down(from, PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
goto out;
}
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+ /*
+ * If still has DELALLOC flag, the extent didn't reach disk,
+ * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidatepage, case 2)
+ *
+ * Note, end is the bytenr of last byte, so we need + 1 here.
+ */
+ if (state->state & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(inode, start, end - start + 1);
+
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
spin_unlock(&BTRFS_I(inode)->lock);
}
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
current->journal_info = dio_data;
mutex_unlock(&inode->i_mutex);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, count);
+ ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
dio_data.outstanding_extents = div64_u64(count +
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode,
- dio_data.reserve);
+ btrfs_delalloc_release_space(inode, offset,
+ dio_data.reserve);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
}
out:
}
}
+ /*
+ * Qgroup reserved space handler
+ * Page here will be either
+ * 1) Already written to disk
+ * In this case, its reserved space is released from data rsv map
+ * and will be freed by delayed_ref handler finally.
+ * So even we call qgroup_free_data(), it won't decrease reserved
+ * space.
+ * 2) Not written to disk
+ * This means the reserved space should be freed here.
+ */
+ btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
u64 page_end;
sb_start_pagefault(inode->i_sb);
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
again:
lock_page(page);
size = i_size_read(inode);
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
btrfs_put_ordered_extent(ordered);
}
}
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
u64 cur_offset = start;
u64 i_size;
u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks lets make its job easier by only searching for those
+ * sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1, 0);
if (ret) {
break;
}
+ last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
+ init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->have_free_bgs = 0;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
- cur_trans->dirty_bg_run = 0;
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
- INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
INIT_LIST_HEAD(&cur_trans->dropped_roots);
* the appropriate flushing if need be.
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
- if (root->fs_info->quota_enabled &&
- is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->nodesize;
- ret = btrfs_qgroup_reserve(root, qgroup_reserved);
- if (ret)
- return ERR_PTR(ret);
- }
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
h->transaction = cur_trans;
h->root = root;
h->use_count = 1;
+
h->type = type;
h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
- INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
- h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
+ btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
}
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- if (!list_empty(&trans->ordered)) {
- spin_lock(&info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
- spin_unlock(&info->trans_lock);
- }
-
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
must_run_delayed_refs = 2;
}
- if (trans->qgroup_reserved) {
- /*
- * the same root has to be passed here between start_transaction
- * and end_transaction. Subvolume quota depends on this.
- */
- btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
+ btrfs_qgroup_free_meta_all(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
}
static inline void
- btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+ btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &ordered->flags));
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
cur_trans = trans->transaction;
return ret;
}
- if (!cur_trans->dirty_bg_run) {
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
- * finds dirty_bg_run = 1
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
- * The dirty_bg_run flag is also used to make sure only
- * one process starts all the block group IO. It wouldn't
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (!cur_trans->dirty_bg_run) {
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
run_it = 1;
- cur_trans->dirty_bg_run = 1;
- }
mutex_unlock(&root->fs_info->ro_block_group_mutex);
if (run_it)
}
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
btrfs_wait_delalloc_flush(root->fs_info);
- btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans);
btrfs_scrub_pause(root);
/*
btrfs_finish_extent_commit(trans, root);
- if (cur_trans->have_free_bgs)
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(root->fs_info);
root->fs_info->last_trans_committed = cur_trans->transid;
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;