btrfs: use standard debug config option to enable free-space-cache debug prints
[linux.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17
/*
 * Forward declarations for the tree balancing helpers defined later in
 * this file; the search/insert paths above their definitions need them.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
31
/*
 * Table of supported checksum algorithms, indexed by the on-disk
 * BTRFS_CSUM_TYPE_* value stored in the super block.
 *
 * @size:   number of bytes the checksum occupies on disk
 * @name:   user visible name; also used as the crypto driver name when
 *          @driver is not set
 * @driver: explicit crypto API driver name, only set when it differs
 *          from @name (blake2b needs the digest-size suffix)
 */
static const struct btrfs_csums {
	u16		size;
	const char	*name;
	const char	*driver;
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
43
44 int btrfs_super_csum_size(const struct btrfs_super_block *s)
45 {
46         u16 t = btrfs_super_csum_type(s);
47         /*
48          * csum type is validated at mount time
49          */
50         return btrfs_csums[t].size;
51 }
52
53 const char *btrfs_super_csum_name(u16 csum_type)
54 {
55         /* csum type is validated at mount time */
56         return btrfs_csums[csum_type].name;
57 }
58
59 /*
60  * Return driver name if defined, otherwise the name that's also a valid driver
61  * name
62  */
63 const char *btrfs_super_csum_driver(u16 csum_type)
64 {
65         /* csum type is validated at mount time */
66         return btrfs_csums[csum_type].driver ?:
67                 btrfs_csums[csum_type].name;
68 }
69
/*
 * Number of entries in the btrfs_csums table, i.e. the number of known
 * checksum types.
 *
 * NOTE(review): "__const" here is GCC's alternate keyword for the plain
 * 'const' type qualifier, which is meaningless on a by-value return.
 * The likely intent was the 'const' *function attribute* — confirm
 * against the prototype in the header before changing.
 */
size_t __const btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
74
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.  Returns
 * NULL on allocation failure.  Free with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
79
80 /* this also releases the path */
81 void btrfs_free_path(struct btrfs_path *p)
82 {
83         if (!p)
84                 return;
85         btrfs_release_path(p);
86         kmem_cache_free(btrfs_path_cachep, p);
87 }
88
89 /*
90  * path release drops references on the extent buffers in the path
91  * and it drops any locks held by this path
92  *
93  * It is safe to call this on paths that no locks or extent buffers held.
94  */
95 noinline void btrfs_release_path(struct btrfs_path *p)
96 {
97         int i;
98
99         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
100                 p->slots[i] = 0;
101                 if (!p->nodes[i])
102                         continue;
103                 if (p->locks[i]) {
104                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
105                         p->locks[i] = 0;
106                 }
107                 free_extent_buffer(p->nodes[i]);
108                 p->nodes[i] = NULL;
109         }
110 }
111
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/*
		 * Refcount already hit zero: the buffer is being freed.
		 * Wait out the grace period and retry, picking up the
		 * (possibly replaced) root pointer next time around.
		 */
		synchronize_rcu();
	}
	return eb;
}
145
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/*
	 * Fast path: already dirty, or this root is not tracked on the
	 * dirty list at all.  Rechecked under the lock below.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
170
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Reference counted roots must only be copied inside the running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* First key of the buffer seeds the key of the new block. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Clone the block, then rewrite the header for its new identity. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Relocation trees take full backrefs (last arg 1), others normal refs. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
229
/* Operation types recorded in the tree modification log. */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
239
/* Snapshot of the old root for a MOD_LOG_ROOT_REPLACE record. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
244
/*
 * A single entry in the tree mod log rb-tree, keyed by (logical, seq).
 * Which members are valid depends on @op, as noted below.
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
270
/*
 * Pull a new tree mod seq number for our operation.  The counter is a
 * 64bit atomic, so this never wraps in practice.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
278
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/* Adding at the tail keeps the list ordered by seq. */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
299
/*
 * Drop the blocker @elem from the tree mod seq list and garbage collect
 * log entries that no remaining blocker can still need.  A no-op when
 * @elem never got a sequence number.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *first;

		/* The list is ordered by seq, so the head is the oldest blocker. */
		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Grab the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
348
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Assigns a fresh sequence number to @tm and links it into the rb-tree.
 * Returns 0 on success, -EEXIST if an entry with the same (logical, seq)
 * already exists.  Caller must hold tree_mod_log_lock for writing.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * Descending order on (logical, seq); the search side in
		 * __tree_mod_log_search walks the same way.
		 */
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
390
391 /*
392  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
393  * returns zero with the tree_mod_log_lock acquired. The caller must hold
394  * this until all tree mod log insertions are recorded in the rb tree and then
395  * write unlock fs_info::tree_mod_log_lock.
396  */
397 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
398                                     struct extent_buffer *eb) {
399         smp_mb();
400         if (list_empty(&(fs_info)->tree_mod_seq_list))
401                 return 1;
402         if (eb && btrfs_header_level(eb) == 0)
403                 return 1;
404
405         write_lock(&fs_info->tree_mod_log_lock);
406         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
407                 write_unlock(&fs_info->tree_mod_log_lock);
408                 return 1;
409         }
410
411         return 0;
412 }
413
414 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
415 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
416                                     struct extent_buffer *eb)
417 {
418         smp_mb();
419         if (list_empty(&(fs_info)->tree_mod_seq_list))
420                 return 0;
421         if (eb && btrfs_header_level(eb) == 0)
422                 return 0;
423
424         return 1;
425 }
426
/*
 * Allocate and fill a tree mod log element describing operation @op on
 * @slot of node @eb.  Returns NULL on allocation failure.  For
 * MOD_LOG_KEY_ADD the old key/blockptr do not exist, so they are left
 * zeroed.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* Marks the element as not yet inserted into the rb-tree. */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
449
/*
 * Log a single-key operation @op on @slot of @eb.  Returns 0 on success
 * (including when logging is not needed), -ENOMEM on allocation failure
 * or the error from the rb-tree insertion.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate before taking the lock; kzalloc may sleep. */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Logging may have become unnecessary while we allocated. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned with the lock held for writing. */
	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
475
/*
 * Log the move of @nr_items keys within @eb from @src_slot to @dst_slot.
 * Keys that get overwritten by a move towards the front of the buffer
 * are logged as individual removals first, then the move itself is
 * logged as one MOD_LOG_MOVE_KEYS element.  Returns 0 on success or a
 * negative errno; on error all allocated/inserted elements are undone.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* All allocations happen before taking the lock. */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock held for writing. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* Unwind: erase any already-inserted elements, then free everything. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
548
/*
 * Insert @nritems pre-allocated elements from @tm_list into the log, in
 * reverse slot order.  On failure, already-inserted elements are erased
 * again and the error is returned; the caller still owns (and frees)
 * all elements.  Caller must hold tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions done so far. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
569
/*
 * Log the replacement of @old_root by @new_root as the tree root.  When
 * @log_removal is set and the old root is a node, all its pointers are
 * logged as removals first so the old state can be reconstructed.
 * Returns 0 on success or a negative errno.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaves need no per-slot removal records, only nodes do. */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Root replace records are keyed on the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* On success this leaves tree_mod_log_lock held for writing. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
638
/*
 * Search the tree mod log for entries at @start with seq >= @min_seq.
 * When @smallest is set, return the entry with the smallest such seq,
 * otherwise the one with the largest.  Returns NULL when no matching
 * entry exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* Mirror image of the insert ordering in __tree_mod_log_insert. */
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on both logical and min_seq. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
680
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.  Returns NULL if nothing
 * matches.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
692
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.  Returns NULL if
 * nothing matches.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
703
/*
 * Log the copy of @nr_items pointers from @src (starting at @src_offset)
 * into @dst (starting at @dst_offset).  Each copied pointer is logged as
 * a removal from @src plus an addition to @dst.  Returns 0 on success or
 * a negative errno; on error everything already inserted is undone.
 */
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Copies between leaves carry no node pointers, nothing to log. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array holds both halves: adds first, removals after. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock held for writing. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: erase any already-inserted elements, then free everything. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
774
/*
 * Log the freeing of node @eb by recording the removal of all its
 * pointers.  Leaves are not logged.  Returns 0 on success or a negative
 * errno.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	/* Allocate one element per pointer before taking the lock. */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock held for writing. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* __tree_mod_log_free_eb already erased anything it had inserted. */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
820
821 /*
822  * check if the tree block can be shared by multiple trees
823  */
824 int btrfs_block_can_be_shared(struct btrfs_root *root,
825                               struct extent_buffer *buf)
826 {
827         /*
828          * Tree blocks not in reference counted trees and tree roots
829          * are never shared. If a block was allocated after the last
830          * snapshot and the block was not allocated by tree relocation,
831          * we know the block is not shared.
832          */
833         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
834             buf != root->node && buf != root->commit_root &&
835             (btrfs_header_generation(buf) <=
836              btrfs_root_last_snapshot(&root->root_item) ||
837              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
838                 return 1;
839
840         return 0;
841 }
842
/*
 * Update the extent backrefs that record who points at @buf, now that it is
 * being COWed into @cow.
 *
 * @trans:    the running transaction handle
 * @root:     the root that owns the path being COWed
 * @buf:      the original (pre-COW) extent buffer
 * @cow:      the freshly allocated copy of @buf
 * @last_ref: out parameter; set to 1 when @buf's last reference is dropped
 *            here, telling the caller the old block can be freed
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Shareable block: read its current refcount and flags. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* A shareable block with zero refs is corruption. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: we hold the only reference. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation-tree blocks must always carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the shared block to use full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the new backref flags in the extent item. */
			ret = btrfs_set_disk_extent_flags(trans,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: transfer the refs from buf to cow. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
953
/*
 * Allocate a new tree block for COW while preventing the flushing of pending
 * block groups when @root is one of the trees that block group creation
 * itself modifies (see the deadlock explanation below).
 *
 * Returns the new extent buffer or an ERR_PTR from btrfs_alloc_tree_block().
 */
static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	/* Re-enable flushing unconditionally; the window above is over. */
	trans->can_flush_pending_bgs = true;

	return ret;
}
995
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success (with *cow_ret set to the new, locked buffer) or a
 * negative errno; allocation and backref failures abort the transaction.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/* Caller passed buf as its own cow target: unlock buf at the end. */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* COW of a ref-counted root must happen in the running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, passed to the allocator as a hint. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Initialize the copy's header for its new location/generation. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move/duplicate the extent backrefs from buf to cow. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root node: swap root->node over to cow. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Non-root block: redirect the parent's pointer to cow. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			/* Log the free so old tree views can be rewound. */
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
1118
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 *
 * Returns the tree mod log element describing the oldest relevant state, or
 * NULL if nothing needs rewinding for @time_seq.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	/* time_seq == 0 means no rewind was requested. */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* Nothing logged for the current root at all. */
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one root further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1172
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* n tracks the item count while each logged operation is undone. */
	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore the key pointer triple. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replace: write back the previous values. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move: shift the key pointers back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once the log entries belong to a different block. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1248
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 *
 * Returns NULL on allocation failure (with the input buffer unlocked and
 * freed).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No rewind requested. */
	if (!time_seq)
		return eb;

	/* Leaves are not rewound here. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	/* Nothing logged for this block since time_seq: keep eb as-is. */
	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed; rebuild its old contents into a dummy
		 * buffer instead of cloning the (emptied) current one.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private clone so eb itself stays untouched. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1308
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes: the current root node is already correct. */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced: rewind from the old root block. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		/* Same root block, but it may have logged modifications. */
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* Old root still exists on disk: read and clone it. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root was freed: rebuild it from the log into a dummy. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Rewind on a private clone of the current root node. */
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	btrfs_tree_read_lock(eb);
	if (old_root) {
		/* Restore the old root's header fields on the fresh copy. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1390
1391 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1392 {
1393         struct tree_mod_elem *tm;
1394         int level;
1395         struct extent_buffer *eb_root = btrfs_root_node(root);
1396
1397         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1398         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1399                 level = tm->old_root.level;
1400         } else {
1401                 level = btrfs_header_level(eb_root);
1402         }
1403         free_extent_buffer(eb_root);
1404
1405         return level;
1406 }
1407
1408 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1409                                    struct btrfs_root *root,
1410                                    struct extent_buffer *buf)
1411 {
1412         if (btrfs_is_testing(root->fs_info))
1413                 return 0;
1414
1415         /* Ensure we can see the FORCE_COW bit */
1416         smp_mb__before_atomic();
1417
1418         /*
1419          * We do not need to cow a block if
1420          * 1) this block is not created or changed in this transaction;
1421          * 2) this block does not belong to TREE_RELOC tree;
1422          * 3) the root is not forced COW.
1423          *
1424          * What is forced COW:
1425          *    when we create snapshot during committing the transaction,
1426          *    after we've finished copying src root, we must COW the shared
1427          *    block to ensure the metadata consistency.
1428          */
1429         if (btrfs_header_generation(buf) == trans->transid &&
1430             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1431             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1432               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1433             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1434                 return 0;
1435         return 1;
1436 }
1437
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * Returns 0 on success (with *cow_ret set) or a negative errno from
 * __btrfs_cow_block().
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* COWing a root being deleted indicates a logic bug somewhere. */
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	/* Sanity: the handle must match the running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction and not written: reuse buf. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator toward the 1GiB-aligned area of the block. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1491
1492 /*
1493  * helper function for defrag to decide if two blocks pointed to by a
1494  * node are actually close by
1495  */
1496 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1497 {
1498         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1499                 return 1;
1500         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1501                 return 1;
1502         return 0;
1503 }
1504
1505 /*
1506  * compare two keys in a memcmp fashion
1507  */
1508 static int comp_keys(const struct btrfs_disk_key *disk,
1509                      const struct btrfs_key *k2)
1510 {
1511         struct btrfs_key k1;
1512
1513         btrfs_disk_key_to_cpu(&k1, disk);
1514
1515         return btrfs_comp_cpu_keys(&k1, k2);
1516 }
1517
1518 /*
1519  * same as comp_keys only with two btrfs_key's
1520  */
1521 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1522 {
1523         if (k1->objectid > k2->objectid)
1524                 return 1;
1525         if (k1->objectid < k2->objectid)
1526                 return -1;
1527         if (k1->type > k2->type)
1528                 return 1;
1529         if (k1->type < k2->type)
1530                 return -1;
1531         if (k1->offset > k2->offset)
1532                 return 1;
1533         if (k1->offset < k2->offset)
1534                 return -1;
1535         return 0;
1536 }
1537
1538 /*
1539  * this is used by the defrag code to go through all the
1540  * leaves pointed to by a node and reallocate them so that
1541  * disk order is close to key order
1542  */
1543 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1544                        struct btrfs_root *root, struct extent_buffer *parent,
1545                        int start_slot, u64 *last_ret,
1546                        struct btrfs_key *progress)
1547 {
1548         struct btrfs_fs_info *fs_info = root->fs_info;
1549         struct extent_buffer *cur;
1550         u64 blocknr;
1551         u64 gen;
1552         u64 search_start = *last_ret;
1553         u64 last_block = 0;
1554         u64 other;
1555         u32 parent_nritems;
1556         int end_slot;
1557         int i;
1558         int err = 0;
1559         int parent_level;
1560         int uptodate;
1561         u32 blocksize;
1562         int progress_passed = 0;
1563         struct btrfs_disk_key disk_key;
1564
1565         parent_level = btrfs_header_level(parent);
1566
1567         WARN_ON(trans->transaction != fs_info->running_transaction);
1568         WARN_ON(trans->transid != fs_info->generation);
1569
1570         parent_nritems = btrfs_header_nritems(parent);
1571         blocksize = fs_info->nodesize;
1572         end_slot = parent_nritems - 1;
1573
1574         if (parent_nritems <= 1)
1575                 return 0;
1576
1577         btrfs_set_lock_blocking_write(parent);
1578
1579         for (i = start_slot; i <= end_slot; i++) {
1580                 struct btrfs_key first_key;
1581                 int close = 1;
1582
1583                 btrfs_node_key(parent, &disk_key, i);
1584                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1585                         continue;
1586
1587                 progress_passed = 1;
1588                 blocknr = btrfs_node_blockptr(parent, i);
1589                 gen = btrfs_node_ptr_generation(parent, i);
1590                 btrfs_node_key_to_cpu(parent, &first_key, i);
1591                 if (last_block == 0)
1592                         last_block = blocknr;
1593
1594                 if (i > 0) {
1595                         other = btrfs_node_blockptr(parent, i - 1);
1596                         close = close_blocks(blocknr, other, blocksize);
1597                 }
1598                 if (!close && i < end_slot) {
1599                         other = btrfs_node_blockptr(parent, i + 1);
1600                         close = close_blocks(blocknr, other, blocksize);
1601                 }
1602                 if (close) {
1603                         last_block = blocknr;
1604                         continue;
1605                 }
1606
1607                 cur = find_extent_buffer(fs_info, blocknr);
1608                 if (cur)
1609                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1610                 else
1611                         uptodate = 0;
1612                 if (!cur || !uptodate) {
1613                         if (!cur) {
1614                                 cur = read_tree_block(fs_info, blocknr, gen,
1615                                                       parent_level - 1,
1616                                                       &first_key);
1617                                 if (IS_ERR(cur)) {
1618                                         return PTR_ERR(cur);
1619                                 } else if (!extent_buffer_uptodate(cur)) {
1620                                         free_extent_buffer(cur);
1621                                         return -EIO;
1622                                 }
1623                         } else if (!uptodate) {
1624                                 err = btrfs_read_buffer(cur, gen,
1625                                                 parent_level - 1,&first_key);
1626                                 if (err) {
1627                                         free_extent_buffer(cur);
1628                                         return err;
1629                                 }
1630                         }
1631                 }
1632                 if (search_start == 0)
1633                         search_start = last_block;
1634
1635                 btrfs_tree_lock(cur);
1636                 btrfs_set_lock_blocking_write(cur);
1637                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1638                                         &cur, search_start,
1639                                         min(16 * blocksize,
1640                                             (end_slot - i) * blocksize));
1641                 if (err) {
1642                         btrfs_tree_unlock(cur);
1643                         free_extent_buffer(cur);
1644                         break;
1645                 }
1646                 search_start = cur->start;
1647                 last_block = cur->start;
1648                 *last_ret = search_start;
1649                 btrfs_tree_unlock(cur);
1650                 free_extent_buffer(cur);
1651         }
1652         return err;
1653 }
1654
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 if the key was found (*slot is its index), 1 if it was not
 * found (*slot is the insertion point), or a negative errno on failure.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	/* Bounce buffer used when the key straddles a page boundary. */
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	/*
	 * Cached mapping of the page that holds the most recently compared
	 * key; reused across loop iterations while offsets stay inside
	 * [map_start, map_start + map_len).
	 */
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	/* Since low is 0, this only fires when max < 0, i.e. a corrupted
	 * or bogus item count was passed in. */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* Remap only when the key lies outside the cached page range. */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				/*
				 * Key crosses a page boundary; copy it out
				 * into the aligned bounce buffer instead.
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	/* Not found: low is where the key would be inserted. */
	*slot = low;
	return 1;
}
1731
1732 /*
1733  * simple bin_search frontend that does the right thing for
1734  * leaves vs nodes
1735  */
1736 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1737                      int level, int *slot)
1738 {
1739         if (level == 0)
1740                 return generic_bin_search(eb,
1741                                           offsetof(struct btrfs_leaf, items),
1742                                           sizeof(struct btrfs_item),
1743                                           key, btrfs_header_nritems(eb),
1744                                           slot);
1745         else
1746                 return generic_bin_search(eb,
1747                                           offsetof(struct btrfs_node, ptrs),
1748                                           sizeof(struct btrfs_key_ptr),
1749                                           key, btrfs_header_nritems(eb),
1750                                           slot);
1751 }
1752
1753 static void root_add_used(struct btrfs_root *root, u32 size)
1754 {
1755         spin_lock(&root->accounting_lock);
1756         btrfs_set_root_used(&root->root_item,
1757                             btrfs_root_used(&root->root_item) + size);
1758         spin_unlock(&root->accounting_lock);
1759 }
1760
1761 static void root_sub_used(struct btrfs_root *root, u32 size)
1762 {
1763         spin_lock(&root->accounting_lock);
1764         btrfs_set_root_used(&root->root_item,
1765                             btrfs_root_used(&root->root_item) - size);
1766         spin_unlock(&root->accounting_lock);
1767 }
1768
1769 /* given a node and slot number, this reads the blocks it points to.  The
1770  * extent buffer is returned with a reference taken (but unlocked).
1771  */
1772 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1773                                            int slot)
1774 {
1775         int level = btrfs_header_level(parent);
1776         struct extent_buffer *eb;
1777         struct btrfs_key first_key;
1778
1779         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1780                 return ERR_PTR(-ENOENT);
1781
1782         BUG_ON(level == 0);
1783
1784         btrfs_node_key_to_cpu(parent, &first_key, slot);
1785         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1786                              btrfs_node_ptr_generation(parent, slot),
1787                              level - 1, &first_key);
1788         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1789                 free_extent_buffer(eb);
1790                 eb = ERR_PTR(-EIO);
1791         }
1792
1793         return eb;
1794 }
1795
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 *
 * Returns 0 on success or a negative errno on failure; on success the
 * path may have been updated to point at a sibling block.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	/* Leaves (level 0) are balanced elsewhere. */
	ASSERT(level > 0);

	mid = path->nodes[level];

	/* The caller must hold a write lock on mid and have COWed it in
	 * this transaction. */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* Remember the original child pointer for the sanity check below. */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking_write(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* Record the root swap in the tree mod log, then publish it. */
		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/* mid->len is still safe to read: the old root-pointer
		 * reference dropped below keeps mid alive here. */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* Nothing to do while mid is at least a quarter full. */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* A missing/unreadable sibling is not fatal; just skip that side. */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* Track orig_slot as if left and mid were one big node. */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		/* -ENOSPC just means right didn't fit; not an error here. */
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty: unlink and free it. */
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right still has items; update its key in parent. */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			/* No left sibling: the tree is inconsistent. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is empty now: unlink and free it. */
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* Our slot migrated into left; repoint the path. */
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* Slot stayed in mid; rebase it past left's items. */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* Drop locks/refs on any sibling the path does not now own. */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2024
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if items were pushed to a sibling (and the path was updated
 * to follow them), 1 if no room could be made this way, or a negative
 * errno on failure.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* Leaves are handled by the leaf push/split paths. */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	/* mid must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* The root has no siblings to push into. */
	if (!parent)
		return 1;

	/* A missing/unreadable left sibling just means we try the right. */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left is (nearly) full, nothing can be pushed. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved into left; fix up parent key + path. */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* Our slot is now inside left. */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* Slot stayed in mid; rebase past left. */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right is (nearly) full, nothing can be pushed. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved into right; fix up parent key + path. */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* Our slot is now inside right. */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* Neither sibling could absorb anything. */
	return 1;
}
2165
2166 /*
2167  * readahead one full node of leaves, finding things that are close
2168  * to the block in 'slot', and triggering ra on them.
2169  */
2170 static void reada_for_search(struct btrfs_fs_info *fs_info,
2171                              struct btrfs_path *path,
2172                              int level, int slot, u64 objectid)
2173 {
2174         struct extent_buffer *node;
2175         struct btrfs_disk_key disk_key;
2176         u32 nritems;
2177         u64 search;
2178         u64 target;
2179         u64 nread = 0;
2180         struct extent_buffer *eb;
2181         u32 nr;
2182         u32 blocksize;
2183         u32 nscan = 0;
2184
2185         if (level != 1)
2186                 return;
2187
2188         if (!path->nodes[level])
2189                 return;
2190
2191         node = path->nodes[level];
2192
2193         search = btrfs_node_blockptr(node, slot);
2194         blocksize = fs_info->nodesize;
2195         eb = find_extent_buffer(fs_info, search);
2196         if (eb) {
2197                 free_extent_buffer(eb);
2198                 return;
2199         }
2200
2201         target = search;
2202
2203         nritems = btrfs_header_nritems(node);
2204         nr = slot;
2205
2206         while (1) {
2207                 if (path->reada == READA_BACK) {
2208                         if (nr == 0)
2209                                 break;
2210                         nr--;
2211                 } else if (path->reada == READA_FORWARD) {
2212                         nr++;
2213                         if (nr >= nritems)
2214                                 break;
2215                 }
2216                 if (path->reada == READA_BACK && objectid) {
2217                         btrfs_node_key(node, &disk_key, nr);
2218                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2219                                 break;
2220                 }
2221                 search = btrfs_node_blockptr(node, nr);
2222                 if ((search <= target && target - search <= 65536) ||
2223                     (search > target && search - target <= 65536)) {
2224                         readahead_tree_block(fs_info, search);
2225                         nread += blocksize;
2226                 }
2227                 nscan++;
2228                 if ((nread > 65536 || nscan > 32))
2229                         break;
2230         }
2231 }
2232
2233 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2234                                        struct btrfs_path *path, int level)
2235 {
2236         int slot;
2237         int nritems;
2238         struct extent_buffer *parent;
2239         struct extent_buffer *eb;
2240         u64 gen;
2241         u64 block1 = 0;
2242         u64 block2 = 0;
2243
2244         parent = path->nodes[level + 1];
2245         if (!parent)
2246                 return;
2247
2248         nritems = btrfs_header_nritems(parent);
2249         slot = path->slots[level + 1];
2250
2251         if (slot > 0) {
2252                 block1 = btrfs_node_blockptr(parent, slot - 1);
2253                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2254                 eb = find_extent_buffer(fs_info, block1);
2255                 /*
2256                  * if we get -eagain from btrfs_buffer_uptodate, we
2257                  * don't want to return eagain here.  That will loop
2258                  * forever
2259                  */
2260                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2261                         block1 = 0;
2262                 free_extent_buffer(eb);
2263         }
2264         if (slot + 1 < nritems) {
2265                 block2 = btrfs_node_blockptr(parent, slot + 1);
2266                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2267                 eb = find_extent_buffer(fs_info, block2);
2268                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2269                         block2 = 0;
2270                 free_extent_buffer(eb);
2271         }
2272
2273         if (block1)
2274                 readahead_tree_block(fs_info, block1);
2275         if (block2)
2276                 readahead_tree_block(fs_info, block2);
2277 }
2278
2279
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* Levels <= skip_level must stay locked (slot-0 / keep_locks rules). */
	int skip_level = level;
	/* Once set, stop raising skip_level for higher levels. */
	int no_skips = 0;
	struct extent_buffer *t;

	/* Walk from 'level' up toward the root. */
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* Slot 0: a key change here propagates upward, keep the
		 * parent locked too. */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: hold the lock when we're on the last slot,
		 * since the caller may advance into the next sibling. */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* Lower the write-lock watermark as we release
			 * levels above min_write_lock_level. */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2335
2336 /*
2337  * helper function for btrfs_search_slot.  The goal is to find a block
2338  * in cache without setting the path to blocking.  If we find the block
2339  * we return zero and the path is unchanged.
2340  *
2341  * If we can't find the block, we set the path blocking and do some
2342  * reada.  -EAGAIN is returned and the search must be repeated.
2343  */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/*
	 * Pull the child pointer at @slot out of the parent node @b: disk
	 * location, expected generation and expected first key, used below
	 * to validate whatever copy of the block we end up with.
	 */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			/* Cached copy is valid, hand it back without re-reading. */
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		/* Re-read failed: drop our ref and the whole path. */
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	/* Default: caller must release the path and retry the search. */
	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	/* Locks were dropped above, the path is no longer consistent. */
	btrfs_release_path(p);
	return ret;
}
2433
2434 /*
2435  * helper function for btrfs_search_slot.  This does all of the checks
2436  * for node-level blocks and does any balancing required based on
2437  * the ins_len.
2438  *
2439  * If no extra work was required, zero is returned.  If we had to
2440  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2441  * start over
2442  */
2443 static int
2444 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2445                        struct btrfs_root *root, struct btrfs_path *p,
2446                        struct extent_buffer *b, int level, int ins_len,
2447                        int *write_lock_level)
2448 {
2449         struct btrfs_fs_info *fs_info = root->fs_info;
2450         int ret;
2451
2452         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2453             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2454                 int sret;
2455
2456                 if (*write_lock_level < level + 1) {
2457                         *write_lock_level = level + 1;
2458                         btrfs_release_path(p);
2459                         goto again;
2460                 }
2461
2462                 btrfs_set_path_blocking(p);
2463                 reada_for_balance(fs_info, p, level);
2464                 sret = split_node(trans, root, p, level);
2465
2466                 BUG_ON(sret > 0);
2467                 if (sret) {
2468                         ret = sret;
2469                         goto done;
2470                 }
2471                 b = p->nodes[level];
2472         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2473                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2474                 int sret;
2475
2476                 if (*write_lock_level < level + 1) {
2477                         *write_lock_level = level + 1;
2478                         btrfs_release_path(p);
2479                         goto again;
2480                 }
2481
2482                 btrfs_set_path_blocking(p);
2483                 reada_for_balance(fs_info, p, level);
2484                 sret = balance_level(trans, root, p, level);
2485
2486                 if (sret) {
2487                         ret = sret;
2488                         goto done;
2489                 }
2490                 b = p->nodes[level];
2491                 if (!b) {
2492                         btrfs_release_path(p);
2493                         goto again;
2494                 }
2495                 BUG_ON(btrfs_header_nritems(b) == 1);
2496         }
2497         return 0;
2498
2499 again:
2500         ret = -EAGAIN;
2501 done:
2502         return ret;
2503 }
2504
/*
 * Thin wrapper around btrfs_bin_search() that caches an exact-match result:
 * once a level compared equal (*prev_cmp == 0), all lower levels must be
 * looked up at slot 0, so the binary search can be skipped entirely.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	/* Previous level matched exactly: descend through slot 0. */
	if (*prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2517
2518 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2519                 u64 iobjectid, u64 ioff, u8 key_type,
2520                 struct btrfs_key *found_key)
2521 {
2522         int ret;
2523         struct btrfs_key key;
2524         struct extent_buffer *eb;
2525
2526         ASSERT(path);
2527         ASSERT(found_key);
2528
2529         key.type = key_type;
2530         key.objectid = iobjectid;
2531         key.offset = ioff;
2532
2533         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2534         if (ret < 0)
2535                 return ret;
2536
2537         eb = path->nodes[0];
2538         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2539                 ret = btrfs_next_leaf(fs_root, path);
2540                 if (ret)
2541                         return ret;
2542                 eb = path->nodes[0];
2543         }
2544
2545         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2546         if (found_key->type != key.type ||
2547                         found_key->objectid != key.objectid)
2548                 return 1;
2549
2550         return 0;
2551 }
2552
/*
 * Get the root node of @root for a search described by @p and lock it as
 * weakly as the search allows: a read lock is preferred, a write lock is
 * taken only when the root level is at or below @write_lock_level.
 *
 * On success the node is recorded in p->nodes[] (and p->locks[] unless
 * locking is skipped) and returned with a reference held; callers are
 * responsible for dropping that reference.  Returns ERR_PTR(-ENOMEM) if
 * cloning the commit root fails.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		/* Caller handles synchronization; just take a reference. */
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2636
2637
2638 /*
2639  * btrfs_search_slot - look for a key in a tree and perform necessary
2640  * modifications to preserve tree invariants.
2641  *
2642  * @trans:      Handle of transaction, used when modifying the tree
2643  * @p:          Holds all btree nodes along the search path
2644  * @root:       The root node of the tree
2645  * @key:        The key we are looking for
 * @ins_len:    Indicates purpose of search: for inserts it is > 0 (the
 *              number of bytes the new item needs), for deletions it's -1
 *              and for plain searches it's 0
2648  * @cow:        boolean should CoW operations be performed. Must always be 1
2649  *              when modifying the tree.
2650  *
2651  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2652  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2653  *
2654  * If @key is found, 0 is returned and you can find the item in the leaf level
2655  * of the path (level 0)
2656  *
2657  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2658  * points to the slot where it should be inserted
2659  *
2660  * If an error is encountered while searching the tree a negative error number
2661  * is returned
2662  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* Read-only search: no level ever needs a write lock. */
	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/* The path was (re)set, any cached comparison result is stale. */
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		/* Set when bin search landed one slot past the wanted key. */
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			/* Split the leaf if an insert would not fit. */
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		/* setup_nodes_for_search() may have rebalanced this level. */
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* Try without blocking first, then block. */
				if (!btrfs_try_tree_write_lock(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
				}
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (!btrfs_tree_read_lock_atomic(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2890
2891 /*
2892  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2893  * current state of the tree together with the operations recorded in the tree
2894  * modification log to search for the key in a previous version of this tree, as
2895  * denoted by the time_seq parameter.
2896  *
2897  * Naturally, there is no support for insert, delete or cow operations.
2898  *
2899  * The resulting path and return value will be set up as if we called
2900  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2901  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* Commit-root searches don't need the mod log; fall back. */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		/* Set when bin search landed one slot past the wanted key. */
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		if (!btrfs_tree_read_lock_atomic(b)) {
			btrfs_set_path_blocking(p);
			btrfs_tree_read_lock(b);
		}
		/* Replay mod-log entries to rewind b back to time_seq. */
		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3004
3005 /*
3006  * helper to use instead of search slot if no exact match is needed but
3007  * instead the next or previous item should be returned.
3008  * When find_higher is true, the next higher item is returned, the next lower
3009  * otherwise.
3010  * When return_any and find_higher are both true, and no higher item is found,
3011  * return the next lower instead.
3012  * When return_any is true and find_higher is false, and no lower item is found,
3013  * return the next higher instead.
3014  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3015  * < 0 on error
3016  */
3017 int btrfs_search_slot_for_read(struct btrfs_root *root,
3018                                const struct btrfs_key *key,
3019                                struct btrfs_path *p, int find_higher,
3020                                int return_any)
3021 {
3022         int ret;
3023         struct extent_buffer *leaf;
3024
3025 again:
3026         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3027         if (ret <= 0)
3028                 return ret;
3029         /*
3030          * a return value of 1 means the path is at the position where the
3031          * item should be inserted. Normally this is the next bigger item,
3032          * but in case the previous item is the last in a leaf, path points
3033          * to the first free slot in the previous leaf, i.e. at an invalid
3034          * item.
3035          */
3036         leaf = p->nodes[0];
3037
3038         if (find_higher) {
3039                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3040                         ret = btrfs_next_leaf(root, p);
3041                         if (ret <= 0)
3042                                 return ret;
3043                         if (!return_any)
3044                                 return 1;
3045                         /*
3046                          * no higher item found, return the next
3047                          * lower instead
3048                          */
3049                         return_any = 0;
3050                         find_higher = 0;
3051                         btrfs_release_path(p);
3052                         goto again;
3053                 }
3054         } else {
3055                 if (p->slots[0] == 0) {
3056                         ret = btrfs_prev_leaf(root, p);
3057                         if (ret < 0)
3058                                 return ret;
3059                         if (!ret) {
3060                                 leaf = p->nodes[0];
3061                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3062                                         p->slots[0]--;
3063                                 return 0;
3064                         }
3065                         if (!return_any)
3066                                 return 1;
3067                         /*
3068                          * no lower item found, return the next
3069                          * higher instead
3070                          */
3071                         return_any = 0;
3072                         find_higher = 1;
3073                         btrfs_release_path(p);
3074                         goto again;
3075                 } else {
3076                         --p->slots[0];
3077                 }
3078         }
3079         return 0;
3080 }
3081
3082 /*
3083  * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
3085  * This is used after shifting pointers to the left, so it stops
3086  * fixing up pointers when a given leaf/node is not in slot 0 of the
3087  * higher levels
3088  *
3089  */
3090 static void fixup_low_keys(struct btrfs_path *path,
3091                            struct btrfs_disk_key *key, int level)
3092 {
3093         int i;
3094         struct extent_buffer *t;
3095         int ret;
3096
3097         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3098                 int tslot = path->slots[i];
3099
3100                 if (!path->nodes[i])
3101                         break;
3102                 t = path->nodes[i];
3103                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3104                                 GFP_ATOMIC);
3105                 BUG_ON(ret < 0);
3106                 btrfs_set_node_key(t, key, tslot);
3107                 btrfs_mark_buffer_dirty(path->nodes[i]);
3108                 if (tslot != 0)
3109                         break;
3110         }
3111 }
3112
/*
 * Update the key of an existing leaf item in place.
 *
 * This function isn't completely safe: it is the caller's responsibility
 * that the new key does not break the sort order of the leaf.  The keys of
 * the neighbouring items are checked here and we BUG() on a violation.
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		/* The new key must sort strictly after the previous item. */
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		/* ... and strictly before the next item. */
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* Changing slot 0 changes the key the parent stores for this leaf. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
3164
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * @dst is the left sibling, @src the node being drained; with @empty set
 * we try to move everything out of @src, otherwise at least 8 pointers
 * are left behind so the node stays reasonably full.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free pointer slots available in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both buffers must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* Record the copy in the tree mod log before touching the buffers. */
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items pointers of src to the end of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3236
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * @dst is the right sibling, @src the node being drained.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both buffers must already be COWed in this transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free pointer slots available in the destination node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* Log the shift of dst's existing pointers before moving them. */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* Make room at the front of dst for the incoming pointers. */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Log and copy the tail of src into the freed front of dst. */
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3303
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* The path must end exactly at the current root. */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* First key of the old root becomes the single key of the new root. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* The new root has exactly one pointer: to the old root node. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	/* Log the root swap before publishing the new root pointer. */
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref for the path's reference to the new root. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3364
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* Caller guarantees there is room; a full node here is a bug. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Shift existing pointers right; log the move for level > 0. */
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3410
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Not the root: try to avoid splitting by pushing siblings. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Move the upper half of c's pointers into the new node. */
	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new node into the parent, right after c. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* Point the path at whichever half now holds the target slot. */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3501
3502 /*
3503  * how many bytes are required to store the items in a leaf.  start
3504  * and nr indicate which items in the leaf to check.  This totals up the
3505  * space used both by the item structs and the item data
3506  */
3507 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3508 {
3509         struct btrfs_item *start_item;
3510         struct btrfs_item *end_item;
3511         struct btrfs_map_token token;
3512         int data_len;
3513         int nritems = btrfs_header_nritems(l);
3514         int end = min(nritems, start + nr) - 1;
3515
3516         if (!nr)
3517                 return 0;
3518         btrfs_init_map_token(&token, l);
3519         start_item = btrfs_item_nr(start);
3520         end_item = btrfs_item_nr(end);
3521         data_len = btrfs_token_item_offset(l, start_item, &token) +
3522                 btrfs_token_item_size(l, start_item, &token);
3523         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3524         data_len += sizeof(struct btrfs_item) * nr;
3525         WARN_ON(data_len < 0);
3526         return data_len;
3527 }
3528
3529 /*
3530  * The space between the end of the leaf items and
3531  * the start of the leaf data.  IOW, how much room
3532  * the leaf has left for both items and data
3533  */
3534 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3535 {
3536         struct btrfs_fs_info *fs_info = leaf->fs_info;
3537         int nritems = btrfs_header_nritems(leaf);
3538         int ret;
3539
3540         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3541         if (ret < 0) {
3542                 btrfs_crit(fs_info,
3543                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3544                            ret,
3545                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3546                            leaf_space_used(leaf, 0, nritems), nritems);
3547         }
3548         return ret;
3549 }
3550
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Moves up to push_items items from the end of the left leaf (path->nodes[0])
 * into the front of @right, then updates the parent key and the path.
 * Returns 0 if something was pushed, 1 otherwise.
 */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* If the target slot is past the end, reserve room for the new item. */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/*
	 * Walk backwards from the last item, accumulating items that fit in
	 * the right leaf.  The i == 0 break below prevents the unsigned
	 * counter from wrapping.
	 */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				/* Keep enough slack around the target slot. */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Total bytes of item data being moved. */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* Shift right's item headers to make room at the front. */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* The right leaf's first key changed; update the parent. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3692
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	/* No right sibling under the same parent: nothing to push to. */
	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking_write(right);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the leaf; re-check the available space. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3775
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * Moves items from the front of the right leaf (path->nodes[0]) onto the
 * end of @left, then fixes up offsets, the parent key and the path.
 */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* Walk forward from slot 0, accumulating items that fit in left. */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				/* Keep enough slack around the target slot. */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* Total bytes of item data being moved. */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/*
	 * Rewrite the offsets of the items we just appended: they were
	 * relative to the right leaf's data area, shift them so they follow
	 * left's last original item.
	 */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Compact the remaining data and item headers to the front. */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* Recompute all remaining item offsets in the right leaf. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	/* The right leaf's first key changed; propagate it upwards. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3929
3930 /*
3931  * push some data in the path leaf to the left, trying to free up at
3932  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3933  *
3934  * max_slot can put a limit on how far into the leaf we'll push items.  The
3935  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3936  * items
3937  */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* slot 0 has no left sibling under the same parent, nothing to push to */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* the leaf is the root of the tree, it has no siblings at all */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking_write(left);

	/* cheap check before COWing: bail out if the left leaf is too full */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW can change the amount of free space, so check again */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left() takes over our lock and reference on 'left' */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4002
4003 /*
4004  * split the path's leaf in two, making sure there is at least data_size
4005  * available for the resulting leaf level of the path.
4006  */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* items [mid, nritems) of 'l' move into the new right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	/* copy the item headers for the moved items */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* amount every moved item's data offset grows by in the new leaf */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	/* rewrite the data offsets relative to the new leaf */
	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* shrink the original leaf and link the new one into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now contains the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4066
4067 /*
4068  * double splits happen when we need to insert a big item in the middle
4069  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4070  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4071  *          A                 B                 C
4072  *
4073  * We avoid this by trying to push the items on either side of our target
4074  * into the adjacent leaves.  If all goes well we can avoid the double split
4075  * completely.
4076  */
4077 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4078                                           struct btrfs_root *root,
4079                                           struct btrfs_path *path,
4080                                           int data_size)
4081 {
4082         int ret;
4083         int progress = 0;
4084         int slot;
4085         u32 nritems;
4086         int space_needed = data_size;
4087
4088         slot = path->slots[0];
4089         if (slot < btrfs_header_nritems(path->nodes[0]))
4090                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4091
4092         /*
4093          * try to push all the items after our slot into the
4094          * right leaf
4095          */
4096         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4097         if (ret < 0)
4098                 return ret;
4099
4100         if (ret == 0)
4101                 progress++;
4102
4103         nritems = btrfs_header_nritems(path->nodes[0]);
4104         /*
4105          * our goal is to get our slot at the start or end of a leaf.  If
4106          * we've done so we're done
4107          */
4108         if (path->slots[0] == 0 || path->slots[0] == nritems)
4109                 return 0;
4110
4111         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4112                 return 0;
4113
4114         /* try to push all the items before our slot into the next leaf */
4115         slot = path->slots[0];
4116         space_needed = data_size;
4117         if (slot > 0)
4118                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4119         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4120         if (ret < 0)
4121                 return ret;
4122
4123         if (ret == 0)
4124                 progress++;
4125
4126         if (progress)
4127                 return 0;
4128         return 1;
4129 }
4130
4131 /*
4132  * split the path's leaf in two, making sure there is at least data_size
4133  * available for the resulting leaf level of the path.
4134  *
4135  * returns 0 if all went well and < 0 on failure.
4136  */
4137 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4138                                struct btrfs_root *root,
4139                                const struct btrfs_key *ins_key,
4140                                struct btrfs_path *path, int data_size,
4141                                int extend)
4142 {
4143         struct btrfs_disk_key disk_key;
4144         struct extent_buffer *l;
4145         u32 nritems;
4146         int mid;
4147         int slot;
4148         struct extent_buffer *right;
4149         struct btrfs_fs_info *fs_info = root->fs_info;
4150         int ret = 0;
4151         int wret;
4152         int split;
4153         int num_doubles = 0;
4154         int tried_avoid_double = 0;
4155
4156         l = path->nodes[0];
4157         slot = path->slots[0];
4158         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4159             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4160                 return -EOVERFLOW;
4161
4162         /* first try to make some room by pushing left and right */
4163         if (data_size && path->nodes[1]) {
4164                 int space_needed = data_size;
4165
4166                 if (slot < btrfs_header_nritems(l))
4167                         space_needed -= btrfs_leaf_free_space(l);
4168
4169                 wret = push_leaf_right(trans, root, path, space_needed,
4170                                        space_needed, 0, 0);
4171                 if (wret < 0)
4172                         return wret;
4173                 if (wret) {
4174                         space_needed = data_size;
4175                         if (slot > 0)
4176                                 space_needed -= btrfs_leaf_free_space(l);
4177                         wret = push_leaf_left(trans, root, path, space_needed,
4178                                               space_needed, 0, (u32)-1);
4179                         if (wret < 0)
4180                                 return wret;
4181                 }
4182                 l = path->nodes[0];
4183
4184                 /* did the pushes work? */
4185                 if (btrfs_leaf_free_space(l) >= data_size)
4186                         return 0;
4187         }
4188
4189         if (!path->nodes[1]) {
4190                 ret = insert_new_root(trans, root, path, 1);
4191                 if (ret)
4192                         return ret;
4193         }
4194 again:
4195         split = 1;
4196         l = path->nodes[0];
4197         slot = path->slots[0];
4198         nritems = btrfs_header_nritems(l);
4199         mid = (nritems + 1) / 2;
4200
4201         if (mid <= slot) {
4202                 if (nritems == 1 ||
4203                     leaf_space_used(l, mid, nritems - mid) + data_size >
4204                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4205                         if (slot >= nritems) {
4206                                 split = 0;
4207                         } else {
4208                                 mid = slot;
4209                                 if (mid != nritems &&
4210                                     leaf_space_used(l, mid, nritems - mid) +
4211                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4212                                         if (data_size && !tried_avoid_double)
4213                                                 goto push_for_double;
4214                                         split = 2;
4215                                 }
4216                         }
4217                 }
4218         } else {
4219                 if (leaf_space_used(l, 0, mid) + data_size >
4220                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4221                         if (!extend && data_size && slot == 0) {
4222                                 split = 0;
4223                         } else if ((extend || !data_size) && slot == 0) {
4224                                 mid = 1;
4225                         } else {
4226                                 mid = slot;
4227                                 if (mid != nritems &&
4228                                     leaf_space_used(l, mid, nritems - mid) +
4229                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4230                                         if (data_size && !tried_avoid_double)
4231                                                 goto push_for_double;
4232                                         split = 2;
4233                                 }
4234                         }
4235                 }
4236         }
4237
4238         if (split == 0)
4239                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4240         else
4241                 btrfs_item_key(l, &disk_key, mid);
4242
4243         right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4244                                              l->start, 0);
4245         if (IS_ERR(right))
4246                 return PTR_ERR(right);
4247
4248         root_add_used(root, fs_info->nodesize);
4249
4250         if (split == 0) {
4251                 if (mid <= slot) {
4252                         btrfs_set_header_nritems(right, 0);
4253                         insert_ptr(trans, path, &disk_key,
4254                                    right->start, path->slots[1] + 1, 1);
4255                         btrfs_tree_unlock(path->nodes[0]);
4256                         free_extent_buffer(path->nodes[0]);
4257                         path->nodes[0] = right;
4258                         path->slots[0] = 0;
4259                         path->slots[1] += 1;
4260                 } else {
4261                         btrfs_set_header_nritems(right, 0);
4262                         insert_ptr(trans, path, &disk_key,
4263                                    right->start, path->slots[1], 1);
4264                         btrfs_tree_unlock(path->nodes[0]);
4265                         free_extent_buffer(path->nodes[0]);
4266                         path->nodes[0] = right;
4267                         path->slots[0] = 0;
4268                         if (path->slots[1] == 0)
4269                                 fixup_low_keys(path, &disk_key, 1);
4270                 }
4271                 /*
4272                  * We create a new leaf 'right' for the required ins_len and
4273                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4274                  * the content of ins_len to 'right'.
4275                  */
4276                 return ret;
4277         }
4278
4279         copy_for_split(trans, path, l, right, slot, mid, nritems);
4280
4281         if (split == 2) {
4282                 BUG_ON(num_doubles != 0);
4283                 num_doubles++;
4284                 goto again;
4285         }
4286
4287         return 0;
4288
4289 push_for_double:
4290         push_for_double_split(trans, root, path, data_size);
4291         tried_avoid_double = 1;
4292         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4293                 return 0;
4294         goto again;
4295 }
4296
/*
 * Make sure the leaf at path->nodes[0] has at least ins_len bytes of free
 * space, re-searching the tree with write locks if it does not.  Returns 0
 * on success, -EAGAIN when the item changed while the path was dropped and
 * the caller should retry, or another negative errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only csum and file extent items are ever split this way */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/*
	 * remember the item size (and extent length) so we can tell whether
	 * the item changed while the path was released for the re-search
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone, make the caller retry */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4363
/*
 * Split the item at path->slots[0] in two at split_offset, giving 'new_key'
 * to the second half.  The leaf must already have room for one extra
 * struct btrfs_item.  Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stage the item's data so we can rewrite it in two parts below */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* make room for the new item header right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* the new item covers bytes [split_offset, item_size) of the data */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* the original item shrinks to its first split_offset bytes */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
4432
4433 /*
4434  * This function splits a single item into two items,
4435  * giving 'new_key' to the new item and splitting the
4436  * old one at split_offset (from the start of the item).
4437  *
4438  * The path may be released by this operation.  After
4439  * the split, the path is pointing to the old item.  The
4440  * new item is going to be in the same node as the old one.
4441  *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
4444  *
4445  * This allows us to split the item in place, keeping a lock on the
4446  * leaf the entire time.
4447  */
4448 int btrfs_split_item(struct btrfs_trans_handle *trans,
4449                      struct btrfs_root *root,
4450                      struct btrfs_path *path,
4451                      const struct btrfs_key *new_key,
4452                      unsigned long split_offset)
4453 {
4454         int ret;
4455         ret = setup_leaf_for_split(trans, root, path,
4456                                    sizeof(struct btrfs_item));
4457         if (ret)
4458                 return ret;
4459
4460         ret = split_item(path, new_key, split_offset);
4461         return ret;
4462 }
4463
4464 /*
 * This function duplicates an item, giving 'new_key' to the new item.
4466  * It guarantees both items live in the same tree leaf and the new item
4467  * is contiguous with the original item.
4468  *
4469  * This allows us to split file extent in place, keeping a lock on the
4470  * leaf the entire time.
4471  */
4472 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4473                          struct btrfs_root *root,
4474                          struct btrfs_path *path,
4475                          const struct btrfs_key *new_key)
4476 {
4477         struct extent_buffer *leaf;
4478         int ret;
4479         u32 item_size;
4480
4481         leaf = path->nodes[0];
4482         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4483         ret = setup_leaf_for_split(trans, root, path,
4484                                    item_size + sizeof(struct btrfs_item));
4485         if (ret)
4486                 return ret;
4487
4488         path->slots[0]++;
4489         setup_items_for_insert(root, path, new_key, &item_size,
4490                                item_size, item_size +
4491                                sizeof(struct btrfs_item), 1);
4492         leaf = path->nodes[0];
4493         memcpy_extent_buffer(leaf,
4494                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4495                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4496                              item_size);
4497         return 0;
4498 }
4499
4500 /*
4501  * make the item pointed to by the path smaller.  new_size indicates
4502  * how small to make it, and from_end tells us if we just chop bytes
4503  * off the end of the item or if we shift the item to chop bytes off
4504  * the front.
4505  */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chop off the tail: slide all data up to the item's new end */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/*
				 * keep the fixed header of an inline extent
				 * at the start of the item before the move
				 */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping off the front: bump the key offset to match */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4599
4600 /*
4601  * make the item pointed to by the path bigger, data_size is the added size.
4602  */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* the caller must have made sure the leaf has enough free space */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	/* grow the item itself into the space just opened up */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4665
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * @root:	root the target leaf belongs to
 * @path:	points at the leaf and slot (path->slots[0]) to insert at
 * @cpu_key:	array of @nr keys for the new items
 * @data_size:	array of @nr payload sizes, one per item
 * @total_data:	sum of all entries in @data_size
 * @total_size:	@total_data plus nr * sizeof(struct btrfs_item) for headers
 * @nr:	number of items to insert
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/*
	 * Inserting at slot 0 changes the leaf's lowest key, so the keys
	 * stored in the parent nodes must be updated to match.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* The caller must already have made room; running out here is fatal. */
	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		/* Inserting in the middle: shift existing items and data. */
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	/* Should be impossible after the free-space check above. */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4757
4758 /*
4759  * Given a key and some data, insert items into the tree.
4760  * This does all the path init required, making room in the tree if needed.
4761  */
4762 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4763                             struct btrfs_root *root,
4764                             struct btrfs_path *path,
4765                             const struct btrfs_key *cpu_key, u32 *data_size,
4766                             int nr)
4767 {
4768         int ret = 0;
4769         int slot;
4770         int i;
4771         u32 total_size = 0;
4772         u32 total_data = 0;
4773
4774         for (i = 0; i < nr; i++)
4775                 total_data += data_size[i];
4776
4777         total_size = total_data + (nr * sizeof(struct btrfs_item));
4778         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4779         if (ret == 0)
4780                 return -EEXIST;
4781         if (ret < 0)
4782                 return ret;
4783
4784         slot = path->slots[0];
4785         BUG_ON(slot < 0);
4786
4787         setup_items_for_insert(root, path, cpu_key, data_size,
4788                                total_data, total_size, nr);
4789         return 0;
4790 }
4791
4792 /*
4793  * Given a key and some data, insert an item into the tree.
4794  * This does all the path init required, making room in the tree if needed.
4795  */
4796 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4797                       const struct btrfs_key *cpu_key, void *data,
4798                       u32 data_size)
4799 {
4800         int ret = 0;
4801         struct btrfs_path *path;
4802         struct extent_buffer *leaf;
4803         unsigned long ptr;
4804
4805         path = btrfs_alloc_path();
4806         if (!path)
4807                 return -ENOMEM;
4808         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4809         if (!ret) {
4810                 leaf = path->nodes[0];
4811                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4812                 write_extent_buffer(leaf, data, ptr, data_size);
4813                 btrfs_mark_buffer_dirty(leaf);
4814         }
4815         btrfs_free_path(path);
4816         return ret;
4817 }
4818
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Removing a middle pointer: log the move for internal nodes
		 * (the tree mod log is only tracked above leaf level), then
		 * close the gap.
		 */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Last slot: nothing to shift, just log the key removal. */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* The node's first key changed; propagate it upwards. */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4864
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/*
	 * Take an extra reference so the buffer outlives the free below and
	 * can still be marked stale afterwards.
	 */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * @slot is the first item to delete and @nr is how many consecutive items
 * to remove.  Returns 0 on success or a negative errno from the leaf
 * balancing helpers.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* Total payload bytes occupied by the items being deleted. */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		/* Deleting from the middle: compact the data and headers. */
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* The data moved up by dsize; fix surviving item offsets. */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* An empty root leaf just stays as an empty leaf. */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* The leaf's first key changed; update parent keys. */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* Everything was pushed away; drop the leaf. */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5009
5010 /*
5011  * search the tree again to find a leaf with lesser keys
5012  * returns 0 if it found something or 1 if there are no lesser leaves.
5013  * returns < 0 on io errors.
5014  *
5015  * This may release the path, and so you may lose any locks held at the
5016  * time you call it.
5017  */
5018 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5019 {
5020         struct btrfs_key key;
5021         struct btrfs_disk_key found_key;
5022         int ret;
5023
5024         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5025
5026         if (key.offset > 0) {
5027                 key.offset--;
5028         } else if (key.type > 0) {
5029                 key.type--;
5030                 key.offset = (u64)-1;
5031         } else if (key.objectid > 0) {
5032                 key.objectid--;
5033                 key.type = (u8)-1;
5034                 key.offset = (u64)-1;
5035         } else {
5036                 return 1;
5037         }
5038
5039         btrfs_release_path(path);
5040         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5041         if (ret < 0)
5042                 return ret;
5043         btrfs_item_key(path->nodes[0], &found_key, 0);
5044         ret = comp_keys(&found_key, &key);
5045         /*
5046          * We might have had an item with the previous key in the tree right
5047          * before we released our path. And after we released our path, that
5048          * item might have been pushed to the first slot (0) of the leaf we
5049          * were holding due to a tree balance. Alternatively, an item with the
5050          * previous key can exist as the only element of a leaf (big fat item).
5051          * Therefore account for these 2 cases, so that our callers (like
5052          * btrfs_previous_item) don't miss an existing item with a key matching
5053          * the previous key we computed above.
5054          */
5055         if (ret <= 0)
5056                 return 0;
5057         return 1;
5058 }
5059
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* Force keep_locks so we can restart safely; restored on exit. */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* A root older than min_trans means nothing below can match. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* Hand the found key back so the caller can re-search with cow. */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5184
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			/* This node is exhausted; try the level above. */
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			/*
			 * The parent isn't locked, so re-search from the last
			 * key of this node to obtain a fresh, locked path
			 * down to this level before continuing.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* An exact match points at cur_key itself; step past it. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip pointers older than the caller cares about. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5260
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0: search the current tree, not a historical snapshot. */
	return btrfs_next_old_leaf(root, path, 0);
}
5270
/*
 * Walk to the leaf that follows the current one in key order.  With a
 * non-zero @time_seq the walk is done against the tree as it looked at
 * that tree mod log sequence number.  Returns 0 if a next leaf was found,
 * 1 if there is none, < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key so we can re-search after dropping locks. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Walk up until we find a node with a slot to the right of ours. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* Walk back down, taking the leftmost (slot 0) child at each level. */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5439
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		/* Step one slot back, moving to the previous leaf if needed. */
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		/* Clamp the slot in case the walk left it one past the end. */
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
5482
/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		/* Step one slot back, moving to the previous leaf if needed. */
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		/* Clamp the slot in case the walk left it one past the end. */
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		/* Either extent item flavor (byte- or metadata-keyed) matches. */
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
This page took 0.337336 seconds and 4 git commands to generate.