]> Git Repo - J-linux.git/blob - fs/bcachefs/fsck.c
Merge patch series "riscv: Extension parsing fixes"
[J-linux.git] / fs / bcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery_passes.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28                                     u32 snapshot)
29 {
30         u64 sectors = 0;
31
32         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33                                 SPOS(inum, 0, snapshot),
34                                 POS(inum, U64_MAX),
35                                 0, k, ({
36                 if (bkey_extent_is_allocation(k.k))
37                         sectors += k.k->size;
38                 0;
39         }));
40
41         return ret ?: sectors;
42 }
43
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45                                     u32 snapshot)
46 {
47         u64 subdirs = 0;
48
49         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50                                     SPOS(inum, 0, snapshot),
51                                     POS(inum, U64_MAX),
52                                     0, k, ({
53                 if (k.k->type == KEY_TYPE_dirent &&
54                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55                         subdirs++;
56                 0;
57         }));
58
59         return ret ?: subdirs;
60 }
61
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63                          u32 *snapshot, u64 *inum)
64 {
65         struct bch_subvolume s;
66         int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67
68         *snapshot = le32_to_cpu(s.snapshot);
69         *inum = le64_to_cpu(s.inode);
70         return ret;
71 }
72
73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74                               struct bch_inode_unpacked *inode)
75 {
76         struct btree_iter iter;
77         struct bkey_s_c k;
78         int ret;
79
80         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
81                              POS(0, inode_nr),
82                              BTREE_ITER_all_snapshots);
83         k = bch2_btree_iter_peek(&iter);
84         ret = bkey_err(k);
85         if (ret)
86                 goto err;
87
88         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
89                 ret = -BCH_ERR_ENOENT_inode;
90                 goto err;
91         }
92
93         ret = bch2_inode_unpack(k, inode);
94 err:
95         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
96         bch2_trans_iter_exit(trans, &iter);
97         return ret;
98 }
99
100 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
101                         struct bch_inode_unpacked *inode,
102                         u32 *snapshot)
103 {
104         struct btree_iter iter;
105         struct bkey_s_c k;
106         int ret;
107
108         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
109                                SPOS(0, inode_nr, *snapshot), 0);
110         ret = bkey_err(k);
111         if (ret)
112                 goto err;
113
114         ret = bkey_is_inode(k.k)
115                 ? bch2_inode_unpack(k, inode)
116                 : -BCH_ERR_ENOENT_inode;
117         if (!ret)
118                 *snapshot = iter.pos.snapshot;
119 err:
120         bch2_trans_iter_exit(trans, &iter);
121         return ret;
122 }
123
124 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
125                            struct bch_hash_info hash_info,
126                            subvol_inum dir, struct qstr *name,
127                            u64 *target, unsigned *type, u32 snapshot)
128 {
129         struct btree_iter iter;
130         struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
131                                                          &hash_info, dir, name, 0, snapshot);
132         int ret = bkey_err(k);
133         if (ret)
134                 return ret;
135
136         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
137         *target = le64_to_cpu(d.v->d_inum);
138         *type = d.v->d_type;
139         bch2_trans_iter_exit(trans, &iter);
140         return 0;
141 }
142
143 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
144 {
145         struct bch_fs *c = trans->c;
146         struct btree_iter iter;
147         struct bch_inode_unpacked dir_inode;
148         struct bch_hash_info dir_hash_info;
149         int ret;
150
151         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
152         if (ret)
153                 goto err;
154
155         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
156
157         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
158
159         ret =   bch2_btree_iter_traverse(&iter) ?:
160                 bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
161                                     &dir_hash_info, &iter,
162                                     BTREE_UPDATE_internal_snapshot_node);
163         bch2_trans_iter_exit(trans, &iter);
164 err:
165         bch_err_fn(c, ret);
166         return ret;
167 }
168
169 /* Get lost+found, create if it doesn't exist: */
170 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
171                             struct bch_inode_unpacked *lostfound,
172                             u64 reattaching_inum)
173 {
174         struct bch_fs *c = trans->c;
175         struct qstr lostfound_str = QSTR("lost+found");
176         u64 inum = 0;
177         unsigned d_type = 0;
178         int ret;
179
180         struct bch_snapshot_tree st;
181         ret = bch2_snapshot_tree_lookup(trans,
182                         bch2_snapshot_tree(c, snapshot), &st);
183         if (ret)
184                 return ret;
185
186         subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187
188         struct bch_subvolume subvol;
189         ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
190                                  false, 0, &subvol);
191         bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
192                     le32_to_cpu(st.master_subvol), snapshot);
193         if (ret)
194                 return ret;
195
196         if (!subvol.inode) {
197                 struct btree_iter iter;
198                 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
199                                 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
200                                 0, subvolume);
201                 ret = PTR_ERR_OR_ZERO(subvol);
202                 if (ret)
203                         return ret;
204
205                 subvol->v.inode = cpu_to_le64(reattaching_inum);
206                 bch2_trans_iter_exit(trans, &iter);
207         }
208
209         root_inum.inum = le64_to_cpu(subvol.inode);
210
211         struct bch_inode_unpacked root_inode;
212         struct bch_hash_info root_hash_info;
213         u32 root_inode_snapshot = snapshot;
214         ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
215         bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
216                     root_inum.inum, le32_to_cpu(st.master_subvol));
217         if (ret)
218                 return ret;
219
220         root_hash_info = bch2_hash_info_init(c, &root_inode);
221
222         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
223                               &lostfound_str, &inum, &d_type, snapshot);
224         if (bch2_err_matches(ret, ENOENT))
225                 goto create_lostfound;
226
227         bch_err_fn(c, ret);
228         if (ret)
229                 return ret;
230
231         if (d_type != DT_DIR) {
232                 bch_err(c, "error looking up lost+found: not a directory");
233                 return -BCH_ERR_ENOENT_not_directory;
234         }
235
236         /*
237          * The bch2_check_dirents pass has already run, dangling dirents
238          * shouldn't exist here:
239          */
240         ret = lookup_inode(trans, inum, lostfound, &snapshot);
241         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
242                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
243         return ret;
244
245 create_lostfound:
246         /*
247          * XXX: we could have a nicer log message here  if we had a nice way to
248          * walk backpointers to print a path
249          */
250         bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
251
252         u64 now = bch2_current_time(c);
253         struct btree_iter lostfound_iter = { NULL };
254         u64 cpu = raw_smp_processor_id();
255
256         bch2_inode_init_early(c, lostfound);
257         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
258         lostfound->bi_dir = root_inode.bi_inum;
259
260         root_inode.bi_nlink++;
261
262         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
263         if (ret)
264                 goto err;
265
266         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
267         ret = bch2_btree_iter_traverse(&lostfound_iter);
268         if (ret)
269                 goto err;
270
271         ret =   bch2_dirent_create_snapshot(trans,
272                                 0, root_inode.bi_inum, snapshot, &root_hash_info,
273                                 mode_to_type(lostfound->bi_mode),
274                                 &lostfound_str,
275                                 lostfound->bi_inum,
276                                 &lostfound->bi_dir_offset,
277                                 STR_HASH_must_create) ?:
278                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
279                                        BTREE_UPDATE_internal_snapshot_node);
280 err:
281         bch_err_msg(c, ret, "creating lost+found");
282         bch2_trans_iter_exit(trans, &lostfound_iter);
283         return ret;
284 }
285
286 static int reattach_inode(struct btree_trans *trans,
287                           struct bch_inode_unpacked *inode,
288                           u32 inode_snapshot)
289 {
290         struct bch_hash_info dir_hash;
291         struct bch_inode_unpacked lostfound;
292         char name_buf[20];
293         struct qstr name;
294         u64 dir_offset = 0;
295         u32 dirent_snapshot = inode_snapshot;
296         int ret;
297
298         if (inode->bi_subvol) {
299                 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
300
301                 u64 root_inum;
302                 ret = subvol_lookup(trans, inode->bi_parent_subvol,
303                                     &dirent_snapshot, &root_inum);
304                 if (ret)
305                         return ret;
306
307                 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
308         } else {
309                 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
310         }
311
312         ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
313         if (ret)
314                 return ret;
315
316         if (S_ISDIR(inode->bi_mode)) {
317                 lostfound.bi_nlink++;
318
319                 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
320                 if (ret)
321                         return ret;
322         }
323
324         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
325
326         name = (struct qstr) QSTR(name_buf);
327
328         ret = bch2_dirent_create_snapshot(trans,
329                                 inode->bi_parent_subvol, lostfound.bi_inum,
330                                 dirent_snapshot,
331                                 &dir_hash,
332                                 inode_d_type(inode),
333                                 &name,
334                                 inode->bi_subvol ?: inode->bi_inum,
335                                 &dir_offset,
336                                 STR_HASH_must_create);
337         if (ret)
338                 return ret;
339
340         inode->bi_dir           = lostfound.bi_inum;
341         inode->bi_dir_offset    = dir_offset;
342
343         return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
344 }
345
346 static int remove_backpointer(struct btree_trans *trans,
347                               struct bch_inode_unpacked *inode)
348 {
349         struct btree_iter iter;
350         struct bkey_s_c_dirent d;
351         int ret;
352
353         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
354                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
355                                      dirent);
356         ret =   bkey_err(d) ?:
357                 __remove_dirent(trans, d.k->p);
358         bch2_trans_iter_exit(trans, &iter);
359         return ret;
360 }
361
362 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
363 {
364         struct bch_fs *c = trans->c;
365
366         struct bch_inode_unpacked inode;
367         int ret = bch2_inode_find_by_inum_trans(trans,
368                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
369                                 &inode);
370         if (ret)
371                 return ret;
372
373         ret = remove_backpointer(trans, &inode);
374         bch_err_msg(c, ret, "removing dirent");
375         if (ret)
376                 return ret;
377
378         ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
379         bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
380         return ret;
381 }
382
383 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
384 {
385         struct bch_fs *c = trans->c;
386
387         if (!bch2_snapshot_is_leaf(c, snapshotid)) {
388                 bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
389                 return -BCH_ERR_fsck_repair_unimplemented;
390         }
391
392         /*
393          * If inum isn't set, that means we're being called from check_dirents,
394          * not check_inodes - the root of this subvolume doesn't exist or we
395          * would have found it there:
396          */
397         if (!inum) {
398                 struct btree_iter inode_iter = {};
399                 struct bch_inode_unpacked new_inode;
400                 u64 cpu = raw_smp_processor_id();
401
402                 bch2_inode_init_early(c, &new_inode);
403                 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
404
405                 new_inode.bi_subvol = subvolid;
406
407                 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
408                           bch2_btree_iter_traverse(&inode_iter) ?:
409                           bch2_inode_write(trans, &inode_iter, &new_inode);
410                 bch2_trans_iter_exit(trans, &inode_iter);
411                 if (ret)
412                         return ret;
413
414                 inum = new_inode.bi_inum;
415         }
416
417         bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
418
419         struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
420         int ret = PTR_ERR_OR_ZERO(new_subvol);
421         if (ret)
422                 return ret;
423
424         bkey_subvolume_init(&new_subvol->k_i);
425         new_subvol->k.p.offset  = subvolid;
426         new_subvol->v.snapshot  = cpu_to_le32(snapshotid);
427         new_subvol->v.inode     = cpu_to_le64(inum);
428         ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
429         if (ret)
430                 return ret;
431
432         struct btree_iter iter;
433         struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
434                         BTREE_ID_snapshots, POS(0, snapshotid),
435                         0, snapshot);
436         ret = PTR_ERR_OR_ZERO(s);
437         bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
438         if (ret)
439                 return ret;
440
441         u32 snapshot_tree = le32_to_cpu(s->v.tree);
442
443         s->v.subvol = cpu_to_le32(subvolid);
444         SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
445         bch2_trans_iter_exit(trans, &iter);
446
447         struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
448                         BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
449                         0, snapshot_tree);
450         ret = PTR_ERR_OR_ZERO(st);
451         bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
452         if (ret)
453                 return ret;
454
455         if (!st->v.master_subvol)
456                 st->v.master_subvol = cpu_to_le32(subvolid);
457
458         bch2_trans_iter_exit(trans, &iter);
459         return 0;
460 }
461
462 static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
463 {
464         struct bch_fs *c = trans->c;
465         struct bch_inode_unpacked new_inode;
466
467         bch2_inode_init_early(c, &new_inode);
468         bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
469         new_inode.bi_size = size;
470         new_inode.bi_inum = inum;
471
472         return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
473 }
474
475 static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
476 {
477         struct btree_iter iter = {};
478
479         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
480         struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
481         bch2_trans_iter_exit(trans, &iter);
482         int ret = bkey_err(k);
483         if (ret)
484                 return ret;
485
486         return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
487 }
488
489 struct snapshots_seen {
490         struct bpos                     pos;
491         snapshot_id_list                ids;
492 };
493
494 static inline void snapshots_seen_exit(struct snapshots_seen *s)
495 {
496         darray_exit(&s->ids);
497 }
498
499 static inline void snapshots_seen_init(struct snapshots_seen *s)
500 {
501         memset(s, 0, sizeof(*s));
502 }
503
504 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
505 {
506         u32 *i;
507         __darray_for_each(s->ids, i) {
508                 if (*i == id)
509                         return 0;
510                 if (*i > id)
511                         break;
512         }
513
514         int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
515         if (ret)
516                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
517                         s->ids.size);
518         return ret;
519 }
520
521 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
522                                  enum btree_id btree_id, struct bpos pos)
523 {
524         if (!bkey_eq(s->pos, pos))
525                 s->ids.nr = 0;
526         s->pos = pos;
527
528         return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
529 }
530
531 /**
532  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
533  * and @ancestor hasn't been overwritten in @seen
534  *
535  * @c:          filesystem handle
536  * @seen:       list of snapshot ids already seen at current position
537  * @id:         descendent snapshot id
538  * @ancestor:   ancestor snapshot id
539  *
540  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
541  */
542 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
543                                     u32 id, u32 ancestor)
544 {
545         ssize_t i;
546
547         EBUG_ON(id > ancestor);
548
549         /* @ancestor should be the snapshot most recently added to @seen */
550         EBUG_ON(ancestor != seen->pos.snapshot);
551         EBUG_ON(ancestor != darray_last(seen->ids));
552
553         if (id == ancestor)
554                 return true;
555
556         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
557                 return false;
558
559         /*
560          * We know that @id is a descendant of @ancestor, we're checking if
561          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
562          * @ascestor and with @id as a descendent.
563          *
564          * But we already know that we're scanning IDs between @id and @ancestor
565          * numerically, since snapshot ID lists are kept sorted, so if we find
566          * an id that's an ancestor of @id we're done:
567          */
568
569         for (i = seen->ids.nr - 2;
570              i >= 0 && seen->ids.data[i] >= id;
571              --i)
572                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
573                         return false;
574
575         return true;
576 }
577
578 /**
579  * ref_visible - given a key with snapshot id @src that points to a key with
580  * snapshot id @dst, test whether there is some snapshot in which @dst is
581  * visible.
582  *
583  * @c:          filesystem handle
584  * @s:          list of snapshot IDs already seen at @src
585  * @src:        snapshot ID of src key
586  * @dst:        snapshot ID of dst key
587  * Returns:     true if there is some snapshot in which @dst is visible
588  *
589  * Assumes we're visiting @src keys in natural key order
590  */
591 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
592                         u32 src, u32 dst)
593 {
594         return dst <= src
595                 ? key_visible_in_snapshot(c, s, dst, src)
596                 : bch2_snapshot_is_ancestor(c, src, dst);
597 }
598
599 static int ref_visible2(struct bch_fs *c,
600                         u32 src, struct snapshots_seen *src_seen,
601                         u32 dst, struct snapshots_seen *dst_seen)
602 {
603         if (dst > src) {
604                 swap(dst, src);
605                 swap(dst_seen, src_seen);
606         }
607         return key_visible_in_snapshot(c, src_seen, dst, src);
608 }
609
610 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
611         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
612              (_i)->snapshot <= (_snapshot); _i++)                                       \
613                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
614
615 struct inode_walker_entry {
616         struct bch_inode_unpacked inode;
617         u32                     snapshot;
618         bool                    seen_this_pos;
619         u64                     count;
620 };
621
622 struct inode_walker {
623         bool                            first_this_inode;
624         bool                            recalculate_sums;
625         struct bpos                     last_pos;
626
627         DARRAY(struct inode_walker_entry) inodes;
628 };
629
630 static void inode_walker_exit(struct inode_walker *w)
631 {
632         darray_exit(&w->inodes);
633 }
634
635 static struct inode_walker inode_walker_init(void)
636 {
637         return (struct inode_walker) { 0, };
638 }
639
640 static int add_inode(struct bch_fs *c, struct inode_walker *w,
641                      struct bkey_s_c inode)
642 {
643         struct bch_inode_unpacked u;
644
645         BUG_ON(bch2_inode_unpack(inode, &u));
646
647         return darray_push(&w->inodes, ((struct inode_walker_entry) {
648                 .inode          = u,
649                 .snapshot       = inode.k->p.snapshot,
650         }));
651 }
652
653 static int get_inodes_all_snapshots(struct btree_trans *trans,
654                                     struct inode_walker *w, u64 inum)
655 {
656         struct bch_fs *c = trans->c;
657         struct btree_iter iter;
658         struct bkey_s_c k;
659         int ret;
660
661         w->recalculate_sums = false;
662         w->inodes.nr = 0;
663
664         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
665                                      BTREE_ITER_all_snapshots, k, ret) {
666                 if (k.k->p.offset != inum)
667                         break;
668
669                 if (bkey_is_inode(k.k))
670                         add_inode(c, w, k);
671         }
672         bch2_trans_iter_exit(trans, &iter);
673
674         if (ret)
675                 return ret;
676
677         w->first_this_inode = true;
678         return 0;
679 }
680
681 static struct inode_walker_entry *
682 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
683 {
684         bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
685
686         struct inode_walker_entry *i;
687         __darray_for_each(w->inodes, i)
688                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
689                         goto found;
690
691         return NULL;
692 found:
693         BUG_ON(k.k->p.snapshot > i->snapshot);
694
695         if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
696                 struct inode_walker_entry new = *i;
697
698                 new.snapshot = k.k->p.snapshot;
699                 new.count = 0;
700
701                 struct printbuf buf = PRINTBUF;
702                 bch2_bkey_val_to_text(&buf, c, k);
703
704                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
705                          "unexpected because we should always update the inode when we update a key in that inode\n"
706                          "%s",
707                          w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
708                 printbuf_exit(&buf);
709
710                 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
711                         --i;
712
713                 size_t pos = i - w->inodes.data;
714                 int ret = darray_insert_item(&w->inodes, pos, new);
715                 if (ret)
716                         return ERR_PTR(ret);
717
718                 i = w->inodes.data + pos;
719         }
720
721         return i;
722 }
723
724 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
725                                              struct inode_walker *w,
726                                              struct bkey_s_c k)
727 {
728         if (w->last_pos.inode != k.k->p.inode) {
729                 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
730                 if (ret)
731                         return ERR_PTR(ret);
732         } else if (bkey_cmp(w->last_pos, k.k->p)) {
733                 darray_for_each(w->inodes, i)
734                         i->seen_this_pos = false;
735         }
736
737         w->last_pos = k.k->p;
738
739         return lookup_inode_for_snapshot(trans->c, w, k);
740 }
741
742 static int get_visible_inodes(struct btree_trans *trans,
743                               struct inode_walker *w,
744                               struct snapshots_seen *s,
745                               u64 inum)
746 {
747         struct bch_fs *c = trans->c;
748         struct btree_iter iter;
749         struct bkey_s_c k;
750         int ret;
751
752         w->inodes.nr = 0;
753
754         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
755                            BTREE_ITER_all_snapshots, k, ret) {
756                 if (k.k->p.offset != inum)
757                         break;
758
759                 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
760                         continue;
761
762                 if (bkey_is_inode(k.k))
763                         add_inode(c, w, k);
764
765                 if (k.k->p.snapshot >= s->pos.snapshot)
766                         break;
767         }
768         bch2_trans_iter_exit(trans, &iter);
769
770         return ret;
771 }
772
773 static int check_key_has_snapshot(struct btree_trans *trans,
774                                   struct btree_iter *iter,
775                                   struct bkey_s_c k)
776 {
777         struct bch_fs *c = trans->c;
778         struct printbuf buf = PRINTBUF;
779         int ret = 0;
780
781         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
782                                 bkey_in_missing_snapshot,
783                                 "key in missing snapshot: %s",
784                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
785                 ret = bch2_btree_delete_at(trans, iter,
786                                             BTREE_UPDATE_internal_snapshot_node) ?: 1;
787 fsck_err:
788         printbuf_exit(&buf);
789         return ret;
790 }
791
792 static int hash_redo_key(struct btree_trans *trans,
793                          const struct bch_hash_desc desc,
794                          struct bch_hash_info *hash_info,
795                          struct btree_iter *k_iter, struct bkey_s_c k)
796 {
797         struct bkey_i *delete;
798         struct bkey_i *tmp;
799
800         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
801         if (IS_ERR(delete))
802                 return PTR_ERR(delete);
803
804         tmp = bch2_bkey_make_mut_noupdate(trans, k);
805         if (IS_ERR(tmp))
806                 return PTR_ERR(tmp);
807
808         bkey_init(&delete->k);
809         delete->k.p = k_iter->pos;
810         return  bch2_btree_iter_traverse(k_iter) ?:
811                 bch2_trans_update(trans, k_iter, delete, 0) ?:
812                 bch2_hash_set_in_snapshot(trans, desc, hash_info,
813                                        (subvol_inum) { 0, k.k->p.inode },
814                                        k.k->p.snapshot, tmp,
815                                        STR_HASH_must_create|
816                                        BTREE_UPDATE_internal_snapshot_node) ?:
817                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
818 }
819
820 static int hash_check_key(struct btree_trans *trans,
821                           const struct bch_hash_desc desc,
822                           struct bch_hash_info *hash_info,
823                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
824 {
825         struct bch_fs *c = trans->c;
826         struct btree_iter iter = { NULL };
827         struct printbuf buf = PRINTBUF;
828         struct bkey_s_c k;
829         u64 hash;
830         int ret = 0;
831
832         if (hash_k.k->type != desc.key_type)
833                 return 0;
834
835         hash = desc.hash_bkey(hash_info, hash_k);
836
837         if (likely(hash == hash_k.k->p.offset))
838                 return 0;
839
840         if (hash_k.k->p.offset < hash)
841                 goto bad_hash;
842
843         for_each_btree_key_norestart(trans, iter, desc.btree_id,
844                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
845                                      BTREE_ITER_slots, k, ret) {
846                 if (bkey_eq(k.k->p, hash_k.k->p))
847                         break;
848
849                 if (fsck_err_on(k.k->type == desc.key_type &&
850                                 !desc.cmp_bkey(k, hash_k), c,
851                                 hash_table_key_duplicate,
852                                 "duplicate hash table keys:\n%s",
853                                 (printbuf_reset(&buf),
854                                  bch2_bkey_val_to_text(&buf, c, hash_k),
855                                  buf.buf))) {
856                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
857                         break;
858                 }
859
860                 if (bkey_deleted(k.k)) {
861                         bch2_trans_iter_exit(trans, &iter);
862                         goto bad_hash;
863                 }
864         }
865 out:
866         bch2_trans_iter_exit(trans, &iter);
867         printbuf_exit(&buf);
868         return ret;
869 bad_hash:
870         if (fsck_err(c, hash_table_key_wrong_offset,
871                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
872                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
873                      (printbuf_reset(&buf),
874                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
875                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
876                 bch_err_fn(c, ret);
877                 if (ret)
878                         return ret;
879                 ret = -BCH_ERR_transaction_restart_nested;
880         }
881 fsck_err:
882         goto out;
883 }
884
885 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
886                                                 struct btree_iter *iter,
887                                                 struct bpos pos)
888 {
889         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
890 }
891
892 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
893                                                struct btree_iter *iter,
894                                                struct bch_inode_unpacked *inode,
895                                                u32 *snapshot)
896 {
897         if (inode->bi_subvol) {
898                 u64 inum;
899                 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
900                 if (ret)
901                         return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
902         }
903
904         return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
905 }
906
907 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
908                                    struct bkey_s_c_dirent d)
909 {
910         return  inode->bi_dir           == d.k->p.inode &&
911                 inode->bi_dir_offset    == d.k->p.offset;
912 }
913
914 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
915                                    struct bch_inode_unpacked *inode)
916 {
917         return d.v->d_type == DT_SUBVOL
918                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
919                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
920 }
921
922 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
923 {
924         struct btree_iter iter;
925         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
926         int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
927         bch2_trans_iter_exit(trans, &iter);
928         return ret;
929 }
930
931 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
932                                     struct bch_inode_unpacked *inode,
933                                     u32 inode_snapshot, bool *write_inode)
934 {
935         struct bch_fs *c = trans->c;
936         struct printbuf buf = PRINTBUF;
937
938         struct btree_iter dirent_iter = {};
939         struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
940         int ret = bkey_err(d);
941         if (ret && !bch2_err_matches(ret, ENOENT))
942                 return ret;
943
944         if (fsck_err_on(ret,
945                         c, inode_points_to_missing_dirent,
946                         "inode points to missing dirent\n%s",
947                         (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
948             fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
949                         c, inode_points_to_wrong_dirent,
950                         "inode points to dirent that does not point back:\n%s",
951                         (bch2_bkey_val_to_text(&buf, c, inode_k),
952                          prt_newline(&buf),
953                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
954                 /*
955                  * We just clear the backpointer fields for now. If we find a
956                  * dirent that points to this inode in check_dirents(), we'll
957                  * update it then; then when we get to check_path() if the
958                  * backpointer is still 0 we'll reattach it.
959                  */
960                 inode->bi_dir = 0;
961                 inode->bi_dir_offset = 0;
962                 inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
963                 *write_inode = true;
964         }
965
966         ret = 0;
967 fsck_err:
968         bch2_trans_iter_exit(trans, &dirent_iter);
969         printbuf_exit(&buf);
970         bch_err_fn(c, ret);
971         return ret;
972 }
973
974 static int check_inode(struct btree_trans *trans,
975                        struct btree_iter *iter,
976                        struct bkey_s_c k,
977                        struct bch_inode_unpacked *prev,
978                        struct snapshots_seen *s,
979                        bool full)
980 {
981         struct bch_fs *c = trans->c;
982         struct bch_inode_unpacked u;
983         bool do_update = false;
984         int ret;
985
986         ret = check_key_has_snapshot(trans, iter, k);
987         if (ret < 0)
988                 goto err;
989         if (ret)
990                 return 0;
991
992         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
993         if (ret)
994                 goto err;
995
996         if (!bkey_is_inode(k.k))
997                 return 0;
998
999         BUG_ON(bch2_inode_unpack(k, &u));
1000
1001         if (!full &&
1002             !(u.bi_flags & (BCH_INODE_i_size_dirty|
1003                             BCH_INODE_i_sectors_dirty|
1004                             BCH_INODE_unlinked)))
1005                 return 0;
1006
1007         if (prev->bi_inum != u.bi_inum)
1008                 *prev = u;
1009
1010         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
1011                         inode_d_type(prev)      != inode_d_type(&u),
1012                         c, inode_snapshot_mismatch,
1013                         "inodes in different snapshots don't match")) {
1014                 bch_err(c, "repair not implemented yet");
1015                 return -BCH_ERR_fsck_repair_unimplemented;
1016         }
1017
1018         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1019             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1020                 struct bpos new_min_pos;
1021
1022                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1023                 if (ret)
1024                         goto err;
1025
1026                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1027
1028                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1029
1030                 bch_err_msg(c, ret, "in fsck updating inode");
1031                 if (ret)
1032                         return ret;
1033
1034                 if (!bpos_eq(new_min_pos, POS_MIN))
1035                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1036                 return 0;
1037         }
1038
1039         if (u.bi_flags & BCH_INODE_unlinked) {
1040                 ret = check_inode_deleted_list(trans, k.k->p);
1041                 if (ret < 0)
1042                         return ret;
1043
1044                 fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
1045                             "inode %llu:%u unlinked, but not on deleted list",
1046                             u.bi_inum, k.k->p.snapshot);
1047                 ret = 0;
1048         }
1049
1050         if (u.bi_flags & BCH_INODE_unlinked &&
1051             (!c->sb.clean ||
1052              fsck_err(c, inode_unlinked_but_clean,
1053                       "filesystem marked clean, but inode %llu unlinked",
1054                       u.bi_inum))) {
1055                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1056                 bch_err_msg(c, ret, "in fsck deleting inode");
1057                 return ret;
1058         }
1059
1060         if (u.bi_flags & BCH_INODE_i_size_dirty &&
1061             (!c->sb.clean ||
1062              fsck_err(c, inode_i_size_dirty_but_clean,
1063                       "filesystem marked clean, but inode %llu has i_size dirty",
1064                       u.bi_inum))) {
1065                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
1066
1067                 /*
1068                  * XXX: need to truncate partial blocks too here - or ideally
1069                  * just switch units to bytes and that issue goes away
1070                  */
1071                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1072                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1073                                      iter->pos.snapshot),
1074                                 POS(u.bi_inum, U64_MAX),
1075                                 0, NULL);
1076                 bch_err_msg(c, ret, "in fsck truncating inode");
1077                 if (ret)
1078                         return ret;
1079
1080                 /*
1081                  * We truncated without our normal sector accounting hook, just
1082                  * make sure we recalculate it:
1083                  */
1084                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
1085
1086                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
1087                 do_update = true;
1088         }
1089
1090         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1091             (!c->sb.clean ||
1092              fsck_err(c, inode_i_sectors_dirty_but_clean,
1093                       "filesystem marked clean, but inode %llu has i_sectors dirty",
1094                       u.bi_inum))) {
1095                 s64 sectors;
1096
1097                 bch_verbose(c, "recounting sectors for inode %llu",
1098                             u.bi_inum);
1099
1100                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1101                 if (sectors < 0) {
1102                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1103                         return sectors;
1104                 }
1105
1106                 u.bi_sectors = sectors;
1107                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1108                 do_update = true;
1109         }
1110
1111         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1112                 u.bi_dir = 0;
1113                 u.bi_dir_offset = 0;
1114                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1115                 do_update = true;
1116         }
1117
1118         if (u.bi_dir || u.bi_dir_offset) {
1119                 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1120                 if (ret)
1121                         goto err;
1122         }
1123
1124         if (fsck_err_on(u.bi_parent_subvol &&
1125                         (u.bi_subvol == 0 ||
1126                          u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1127                         c, inode_bi_parent_nonzero,
1128                         "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1129                         u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1130                 u.bi_parent_subvol = 0;
1131                 do_update = true;
1132         }
1133
1134         if (u.bi_subvol) {
1135                 struct bch_subvolume s;
1136
1137                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1138                 if (ret && !bch2_err_matches(ret, ENOENT))
1139                         goto err;
1140
1141                 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1142                         ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1143                         goto do_update;
1144                 }
1145
1146                 if (fsck_err_on(ret,
1147                                 c, inode_bi_subvol_missing,
1148                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
1149                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1150                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1151                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1152                                                            k.k->p.snapshot),
1153                                 c, inode_bi_subvol_wrong,
1154                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1155                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1156                                 le64_to_cpu(s.inode),
1157                                 le32_to_cpu(s.snapshot))) {
1158                         u.bi_subvol = 0;
1159                         u.bi_parent_subvol = 0;
1160                         do_update = true;
1161                 }
1162         }
1163 do_update:
1164         if (do_update) {
1165                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1166                 bch_err_msg(c, ret, "in fsck updating inode");
1167                 if (ret)
1168                         return ret;
1169         }
1170 err:
1171 fsck_err:
1172         bch_err_fn(c, ret);
1173         return ret;
1174 }
1175
1176 int bch2_check_inodes(struct bch_fs *c)
1177 {
1178         bool full = c->opts.fsck;
1179         struct bch_inode_unpacked prev = { 0 };
1180         struct snapshots_seen s;
1181
1182         snapshots_seen_init(&s);
1183
1184         int ret = bch2_trans_run(c,
1185                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1186                                 POS_MIN,
1187                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1188                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1189                         check_inode(trans, &iter, k, &prev, &s, full)));
1190
1191         snapshots_seen_exit(&s);
1192         bch_err_fn(c, ret);
1193         return ret;
1194 }
1195
1196 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1197 {
1198         struct bch_fs *c = trans->c;
1199         int ret = 0;
1200         s64 count2;
1201
1202         darray_for_each(w->inodes, i) {
1203                 if (i->inode.bi_sectors == i->count)
1204                         continue;
1205
1206                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1207
1208                 if (w->recalculate_sums)
1209                         i->count = count2;
1210
1211                 if (i->count != count2) {
1212                         bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1213                                             w->last_pos.inode, i->snapshot, i->count, count2);
1214                         return -BCH_ERR_internal_fsck_err;
1215                 }
1216
1217                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1218                                 c, inode_i_sectors_wrong,
1219                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1220                                 w->last_pos.inode, i->snapshot,
1221                                 i->inode.bi_sectors, i->count)) {
1222                         i->inode.bi_sectors = i->count;
1223                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1224                         if (ret)
1225                                 break;
1226                 }
1227         }
1228 fsck_err:
1229         bch_err_fn(c, ret);
1230         return ret;
1231 }
1232
1233 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1234 {
1235         u32 restart_count = trans->restart_count;
1236         return check_i_sectors_notnested(trans, w) ?:
1237                 trans_was_restarted(trans, restart_count);
1238 }
1239
1240 struct extent_end {
1241         u32                     snapshot;
1242         u64                     offset;
1243         struct snapshots_seen   seen;
1244 };
1245
1246 struct extent_ends {
1247         struct bpos                     last_pos;
1248         DARRAY(struct extent_end)       e;
1249 };
1250
1251 static void extent_ends_reset(struct extent_ends *extent_ends)
1252 {
1253         darray_for_each(extent_ends->e, i)
1254                 snapshots_seen_exit(&i->seen);
1255         extent_ends->e.nr = 0;
1256 }
1257
1258 static void extent_ends_exit(struct extent_ends *extent_ends)
1259 {
1260         extent_ends_reset(extent_ends);
1261         darray_exit(&extent_ends->e);
1262 }
1263
1264 static void extent_ends_init(struct extent_ends *extent_ends)
1265 {
1266         memset(extent_ends, 0, sizeof(*extent_ends));
1267 }
1268
1269 static int extent_ends_at(struct bch_fs *c,
1270                           struct extent_ends *extent_ends,
1271                           struct snapshots_seen *seen,
1272                           struct bkey_s_c k)
1273 {
1274         struct extent_end *i, n = (struct extent_end) {
1275                 .offset         = k.k->p.offset,
1276                 .snapshot       = k.k->p.snapshot,
1277                 .seen           = *seen,
1278         };
1279
1280         n.seen.ids.data = kmemdup(seen->ids.data,
1281                               sizeof(seen->ids.data[0]) * seen->ids.size,
1282                               GFP_KERNEL);
1283         if (!n.seen.ids.data)
1284                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1285
1286         __darray_for_each(extent_ends->e, i) {
1287                 if (i->snapshot == k.k->p.snapshot) {
1288                         snapshots_seen_exit(&i->seen);
1289                         *i = n;
1290                         return 0;
1291                 }
1292
1293                 if (i->snapshot >= k.k->p.snapshot)
1294                         break;
1295         }
1296
1297         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1298 }
1299
1300 static int overlapping_extents_found(struct btree_trans *trans,
1301                                      enum btree_id btree,
1302                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1303                                      struct bkey pos2,
1304                                      bool *fixed,
1305                                      struct extent_end *extent_end)
1306 {
1307         struct bch_fs *c = trans->c;
1308         struct printbuf buf = PRINTBUF;
1309         struct btree_iter iter1, iter2 = { NULL };
1310         struct bkey_s_c k1, k2;
1311         int ret;
1312
1313         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1314
1315         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1316                              BTREE_ITER_all_snapshots|
1317                              BTREE_ITER_not_extents);
1318         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1319         ret = bkey_err(k1);
1320         if (ret)
1321                 goto err;
1322
1323         prt_str(&buf, "\n  ");
1324         bch2_bkey_val_to_text(&buf, c, k1);
1325
1326         if (!bpos_eq(pos1, k1.k->p)) {
1327                 prt_str(&buf, "\n  wanted\n  ");
1328                 bch2_bpos_to_text(&buf, pos1);
1329                 prt_str(&buf, "\n  ");
1330                 bch2_bkey_to_text(&buf, &pos2);
1331
1332                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1333                         __func__, buf.buf);
1334                 ret = -BCH_ERR_internal_fsck_err;
1335                 goto err;
1336         }
1337
1338         bch2_trans_copy_iter(&iter2, &iter1);
1339
1340         while (1) {
1341                 bch2_btree_iter_advance(&iter2);
1342
1343                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1344                 ret = bkey_err(k2);
1345                 if (ret)
1346                         goto err;
1347
1348                 if (bpos_ge(k2.k->p, pos2.p))
1349                         break;
1350         }
1351
1352         prt_str(&buf, "\n  ");
1353         bch2_bkey_val_to_text(&buf, c, k2);
1354
1355         if (bpos_gt(k2.k->p, pos2.p) ||
1356             pos2.size != k2.k->size) {
1357                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1358                         __func__, buf.buf);
1359                 ret = -BCH_ERR_internal_fsck_err;
1360                 goto err;
1361         }
1362
1363         prt_printf(&buf, "\n  overwriting %s extent",
1364                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1365
1366         if (fsck_err(c, extent_overlapping,
1367                      "overlapping extents%s", buf.buf)) {
1368                 struct btree_iter *old_iter = &iter1;
1369                 struct disk_reservation res = { 0 };
1370
1371                 if (pos1.snapshot < pos2.p.snapshot) {
1372                         old_iter = &iter2;
1373                         swap(k1, k2);
1374                 }
1375
1376                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1377
1378                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1379                                 BTREE_UPDATE_internal_snapshot_node,
1380                                 k1, k2) ?:
1381                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1382                 bch2_disk_reservation_put(c, &res);
1383
1384                 if (ret)
1385                         goto err;
1386
1387                 *fixed = true;
1388
1389                 if (pos1.snapshot == pos2.p.snapshot) {
1390                         /*
1391                          * We overwrote the first extent, and did the overwrite
1392                          * in the same snapshot:
1393                          */
1394                         extent_end->offset = bkey_start_offset(&pos2);
1395                 } else if (pos1.snapshot > pos2.p.snapshot) {
1396                         /*
1397                          * We overwrote the first extent in pos2's snapshot:
1398                          */
1399                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1400                 } else {
1401                         /*
1402                          * We overwrote the second extent - restart
1403                          * check_extent() from the top:
1404                          */
1405                         ret = -BCH_ERR_transaction_restart_nested;
1406                 }
1407         }
1408 fsck_err:
1409 err:
1410         bch2_trans_iter_exit(trans, &iter2);
1411         bch2_trans_iter_exit(trans, &iter1);
1412         printbuf_exit(&buf);
1413         return ret;
1414 }
1415
1416 static int check_overlapping_extents(struct btree_trans *trans,
1417                               struct snapshots_seen *seen,
1418                               struct extent_ends *extent_ends,
1419                               struct bkey_s_c k,
1420                               struct btree_iter *iter,
1421                               bool *fixed)
1422 {
1423         struct bch_fs *c = trans->c;
1424         int ret = 0;
1425
1426         /* transaction restart, running again */
1427         if (bpos_eq(extent_ends->last_pos, k.k->p))
1428                 return 0;
1429
1430         if (extent_ends->last_pos.inode != k.k->p.inode)
1431                 extent_ends_reset(extent_ends);
1432
1433         darray_for_each(extent_ends->e, i) {
1434                 if (i->offset <= bkey_start_offset(k.k))
1435                         continue;
1436
1437                 if (!ref_visible2(c,
1438                                   k.k->p.snapshot, seen,
1439                                   i->snapshot, &i->seen))
1440                         continue;
1441
1442                 ret = overlapping_extents_found(trans, iter->btree_id,
1443                                                 SPOS(iter->pos.inode,
1444                                                      i->offset,
1445                                                      i->snapshot),
1446                                                 &i->seen,
1447                                                 *k.k, fixed, i);
1448                 if (ret)
1449                         goto err;
1450         }
1451
1452         extent_ends->last_pos = k.k->p;
1453 err:
1454         return ret;
1455 }
1456
1457 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1458                                 struct bkey_s_c k)
1459 {
1460         struct bch_fs *c = trans->c;
1461         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1462         struct bch_extent_crc_unpacked crc;
1463         const union bch_extent_entry *i;
1464         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1465
1466         bkey_for_each_crc(k.k, ptrs, crc, i)
1467                 if (crc_is_encoded(crc) &&
1468                     crc.uncompressed_size > encoded_extent_max_sectors) {
1469                         struct printbuf buf = PRINTBUF;
1470
1471                         bch2_bkey_val_to_text(&buf, c, k);
1472                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1473                         printbuf_exit(&buf);
1474                 }
1475
1476         return 0;
1477 }
1478
1479 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1480                         struct bkey_s_c k,
1481                         struct inode_walker *inode,
1482                         struct snapshots_seen *s,
1483                         struct extent_ends *extent_ends)
1484 {
1485         struct bch_fs *c = trans->c;
1486         struct inode_walker_entry *i;
1487         struct printbuf buf = PRINTBUF;
1488         int ret = 0;
1489
1490         ret = check_key_has_snapshot(trans, iter, k);
1491         if (ret) {
1492                 ret = ret < 0 ? ret : 0;
1493                 goto out;
1494         }
1495
1496         if (inode->last_pos.inode != k.k->p.inode) {
1497                 ret = check_i_sectors(trans, inode);
1498                 if (ret)
1499                         goto err;
1500         }
1501
1502         i = walk_inode(trans, inode, k);
1503         ret = PTR_ERR_OR_ZERO(i);
1504         if (ret)
1505                 goto err;
1506
1507         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1508         if (ret)
1509                 goto err;
1510
1511         if (k.k->type != KEY_TYPE_whiteout) {
1512                 if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1513                         ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
1514                                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1515                         if (ret)
1516                                 goto err;
1517
1518                         inode->last_pos.inode--;
1519                         ret = -BCH_ERR_transaction_restart_nested;
1520                         goto err;
1521                 }
1522
1523                 if (fsck_err_on(!i, c, extent_in_missing_inode,
1524                                 "extent in missing inode:\n  %s",
1525                                 (printbuf_reset(&buf),
1526                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1527                         goto delete;
1528
1529                 if (fsck_err_on(i &&
1530                                 !S_ISREG(i->inode.bi_mode) &&
1531                                 !S_ISLNK(i->inode.bi_mode),
1532                                 c, extent_in_non_reg_inode,
1533                                 "extent in non regular inode mode %o:\n  %s",
1534                                 i->inode.bi_mode,
1535                                 (printbuf_reset(&buf),
1536                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1537                         goto delete;
1538
1539                 ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1540                                                 &inode->recalculate_sums);
1541                 if (ret)
1542                         goto err;
1543         }
1544
1545         /*
1546          * Check inodes in reverse order, from oldest snapshots to newest,
1547          * starting from the inode that matches this extent's snapshot. If we
1548          * didn't have one, iterate over all inodes:
1549          */
1550         if (!i)
1551                 i = inode->inodes.data + inode->inodes.nr - 1;
1552
1553         for (;
1554              inode->inodes.data && i >= inode->inodes.data;
1555              --i) {
1556                 if (i->snapshot > k.k->p.snapshot ||
1557                     !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1558                         continue;
1559
1560                 if (k.k->type != KEY_TYPE_whiteout) {
1561                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1562                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1563                                         !bkey_extent_is_reservation(k),
1564                                         c, extent_past_end_of_inode,
1565                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1566                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1567                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1568                                 struct btree_iter iter2;
1569
1570                                 bch2_trans_copy_iter(&iter2, iter);
1571                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1572                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1573                                         bch2_btree_delete_at(trans, &iter2,
1574                                                 BTREE_UPDATE_internal_snapshot_node);
1575                                 bch2_trans_iter_exit(trans, &iter2);
1576                                 if (ret)
1577                                         goto err;
1578
1579                                 iter->k.type = KEY_TYPE_whiteout;
1580                         }
1581
1582                         if (bkey_extent_is_allocation(k.k))
1583                                 i->count += k.k->size;
1584                 }
1585
1586                 i->seen_this_pos = true;
1587         }
1588
1589         if (k.k->type != KEY_TYPE_whiteout) {
1590                 ret = extent_ends_at(c, extent_ends, s, k);
1591                 if (ret)
1592                         goto err;
1593         }
1594 out:
1595 err:
1596 fsck_err:
1597         printbuf_exit(&buf);
1598         bch_err_fn(c, ret);
1599         return ret;
1600 delete:
1601         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1602         goto out;
1603 }
1604
1605 /*
1606  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1607  * that i_size an i_sectors are consistent
1608  */
1609 int bch2_check_extents(struct bch_fs *c)
1610 {
1611         struct inode_walker w = inode_walker_init();
1612         struct snapshots_seen s;
1613         struct extent_ends extent_ends;
1614         struct disk_reservation res = { 0 };
1615
1616         snapshots_seen_init(&s);
1617         extent_ends_init(&extent_ends);
1618
1619         int ret = bch2_trans_run(c,
1620                 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1621                                 POS(BCACHEFS_ROOT_INO, 0),
1622                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1623                                 &res, NULL,
1624                                 BCH_TRANS_COMMIT_no_enospc, ({
1625                         bch2_disk_reservation_put(c, &res);
1626                         check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1627                         check_extent_overbig(trans, &iter, k);
1628                 })) ?:
1629                 check_i_sectors_notnested(trans, &w));
1630
1631         bch2_disk_reservation_put(c, &res);
1632         extent_ends_exit(&extent_ends);
1633         inode_walker_exit(&w);
1634         snapshots_seen_exit(&s);
1635
1636         bch_err_fn(c, ret);
1637         return ret;
1638 }
1639
1640 int bch2_check_indirect_extents(struct bch_fs *c)
1641 {
1642         struct disk_reservation res = { 0 };
1643
1644         int ret = bch2_trans_run(c,
1645                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1646                                 POS_MIN,
1647                                 BTREE_ITER_prefetch, k,
1648                                 &res, NULL,
1649                                 BCH_TRANS_COMMIT_no_enospc, ({
1650                         bch2_disk_reservation_put(c, &res);
1651                         check_extent_overbig(trans, &iter, k);
1652                 })));
1653
1654         bch2_disk_reservation_put(c, &res);
1655         bch_err_fn(c, ret);
1656         return ret;
1657 }
1658
1659 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1660 {
1661         struct bch_fs *c = trans->c;
1662         int ret = 0;
1663         s64 count2;
1664
1665         darray_for_each(w->inodes, i) {
1666                 if (i->inode.bi_nlink == i->count)
1667                         continue;
1668
1669                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1670                 if (count2 < 0)
1671                         return count2;
1672
1673                 if (i->count != count2) {
1674                         bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1675                                             w->last_pos.inode, i->snapshot, i->count, count2);
1676                         i->count = count2;
1677                         if (i->inode.bi_nlink == i->count)
1678                                 continue;
1679                 }
1680
1681                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1682                                 c, inode_dir_wrong_nlink,
1683                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1684                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1685                         i->inode.bi_nlink = i->count;
1686                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1687                         if (ret)
1688                                 break;
1689                 }
1690         }
1691 fsck_err:
1692         bch_err_fn(c, ret);
1693         return ret;
1694 }
1695
1696 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1697 {
1698         u32 restart_count = trans->restart_count;
1699         return check_subdir_count_notnested(trans, w) ?:
1700                 trans_was_restarted(trans, restart_count);
1701 }
1702
1703 static int check_dirent_inode_dirent(struct btree_trans *trans,
1704                                    struct btree_iter *iter,
1705                                    struct bkey_s_c_dirent d,
1706                                    struct bch_inode_unpacked *target,
1707                                    u32 target_snapshot)
1708 {
1709         struct bch_fs *c = trans->c;
1710         struct printbuf buf = PRINTBUF;
1711         int ret = 0;
1712
1713         if (inode_points_to_dirent(target, d))
1714                 return 0;
1715
1716         if (bch2_inode_should_have_bp(target) &&
1717             !fsck_err(c, inode_wrong_backpointer,
1718                       "dirent points to inode that does not point back:\n  %s",
1719                       (bch2_bkey_val_to_text(&buf, c, d.s_c),
1720                        prt_printf(&buf, "\n  "),
1721                        bch2_inode_unpacked_to_text(&buf, target),
1722                        buf.buf)))
1723                 goto out_noiter;
1724
1725         if (!target->bi_dir &&
1726             !target->bi_dir_offset) {
1727                 target->bi_dir          = d.k->p.inode;
1728                 target->bi_dir_offset   = d.k->p.offset;
1729                 return __bch2_fsck_write_inode(trans, target, target_snapshot);
1730         }
1731
1732         struct btree_iter bp_iter = { NULL };
1733         struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1734                               SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1735         ret = bkey_err(bp_dirent);
1736         if (ret && !bch2_err_matches(ret, ENOENT))
1737                 goto err;
1738
1739         bool backpointer_exists = !ret;
1740         ret = 0;
1741
1742         if (fsck_err_on(!backpointer_exists,
1743                         c, inode_wrong_backpointer,
1744                         "inode %llu:%u has wrong backpointer:\n"
1745                         "got       %llu:%llu\n"
1746                         "should be %llu:%llu",
1747                         target->bi_inum, target_snapshot,
1748                         target->bi_dir,
1749                         target->bi_dir_offset,
1750                         d.k->p.inode,
1751                         d.k->p.offset)) {
1752                 target->bi_dir          = d.k->p.inode;
1753                 target->bi_dir_offset   = d.k->p.offset;
1754                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1755                 goto out;
1756         }
1757
1758         bch2_bkey_val_to_text(&buf, c, d.s_c);
1759         prt_newline(&buf);
1760         if (backpointer_exists)
1761                 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1762
1763         if (fsck_err_on(backpointer_exists &&
1764                         (S_ISDIR(target->bi_mode) ||
1765                          target->bi_subvol),
1766                         c, inode_dir_multiple_links,
1767                         "%s %llu:%u with multiple links\n%s",
1768                         S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1769                         target->bi_inum, target_snapshot, buf.buf)) {
1770                 ret = __remove_dirent(trans, d.k->p);
1771                 goto out;
1772         }
1773
1774         /*
1775          * hardlinked file with nlink 0:
1776          * We're just adjusting nlink here so check_nlinks() will pick
1777          * it up, it ignores inodes with nlink 0
1778          */
1779         if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1780                         c, inode_multiple_links_but_nlink_0,
1781                         "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1782                         target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1783                 target->bi_nlink++;
1784                 target->bi_flags &= ~BCH_INODE_unlinked;
1785                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1786                 if (ret)
1787                         goto err;
1788         }
1789 out:
1790 err:
1791 fsck_err:
1792         bch2_trans_iter_exit(trans, &bp_iter);
1793 out_noiter:
1794         printbuf_exit(&buf);
1795         bch_err_fn(c, ret);
1796         return ret;
1797 }
1798
1799 static int check_dirent_target(struct btree_trans *trans,
1800                                struct btree_iter *iter,
1801                                struct bkey_s_c_dirent d,
1802                                struct bch_inode_unpacked *target,
1803                                u32 target_snapshot)
1804 {
1805         struct bch_fs *c = trans->c;
1806         struct bkey_i_dirent *n;
1807         struct printbuf buf = PRINTBUF;
1808         int ret = 0;
1809
1810         ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1811         if (ret)
1812                 goto err;
1813
1814         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1815                         c, dirent_d_type_wrong,
1816                         "incorrect d_type: got %s, should be %s:\n%s",
1817                         bch2_d_type_str(d.v->d_type),
1818                         bch2_d_type_str(inode_d_type(target)),
1819                         (printbuf_reset(&buf),
1820                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1821                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1822                 ret = PTR_ERR_OR_ZERO(n);
1823                 if (ret)
1824                         goto err;
1825
1826                 bkey_reassemble(&n->k_i, d.s_c);
1827                 n->v.d_type = inode_d_type(target);
1828                 if (n->v.d_type == DT_SUBVOL) {
1829                         n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1830                         n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1831                 } else {
1832                         n->v.d_inum = cpu_to_le64(target->bi_inum);
1833                 }
1834
1835                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1836                 if (ret)
1837                         goto err;
1838
1839                 d = dirent_i_to_s_c(n);
1840         }
1841 err:
1842 fsck_err:
1843         printbuf_exit(&buf);
1844         bch_err_fn(c, ret);
1845         return ret;
1846 }
1847
1848 /* find a subvolume that's a descendent of @snapshot: */
1849 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1850 {
1851         struct btree_iter iter;
1852         struct bkey_s_c k;
1853         int ret;
1854
1855         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1856                 if (k.k->type != KEY_TYPE_subvolume)
1857                         continue;
1858
1859                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1860                 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1861                         bch2_trans_iter_exit(trans, &iter);
1862                         *subvolid = k.k->p.offset;
1863                         goto found;
1864                 }
1865         }
1866         if (!ret)
1867                 ret = -ENOENT;
1868 found:
1869         bch2_trans_iter_exit(trans, &iter);
1870         return ret;
1871 }
1872
1873 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1874                                   struct bkey_s_c_dirent d)
1875 {
1876         struct bch_fs *c = trans->c;
1877         struct btree_iter subvol_iter = {};
1878         struct bch_inode_unpacked subvol_root;
1879         u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1880         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1881         u32 parent_snapshot;
1882         u32 new_parent_subvol = 0;
1883         u64 parent_inum;
1884         struct printbuf buf = PRINTBUF;
1885         int ret = 0;
1886
1887         ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1888         if (ret && !bch2_err_matches(ret, ENOENT))
1889                 return ret;
1890
1891         if (ret ||
1892             (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
1893                 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1894                 if (ret2 && !bch2_err_matches(ret, ENOENT))
1895                         return ret2;
1896         }
1897
1898         if (ret &&
1899             !new_parent_subvol &&
1900             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1901                 /*
1902                  * Couldn't find a subvol for dirent's snapshot - but we lost
1903                  * subvols, so we need to reconstruct:
1904                  */
1905                 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1906                 if (ret)
1907                         return ret;
1908
1909                 parent_snapshot = d.k->p.snapshot;
1910         }
1911
1912         if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1913                         "dirent parent_subvol points to missing subvolume\n%s",
1914                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1915             fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1916                         c, dirent_not_visible_in_parent_subvol,
1917                         "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1918                         parent_snapshot,
1919                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1920                 if (!new_parent_subvol) {
1921                         bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
1922                         return -BCH_ERR_fsck_repair_unimplemented;
1923                 }
1924
1925                 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1926                 ret = PTR_ERR_OR_ZERO(new_dirent);
1927                 if (ret)
1928                         goto err;
1929
1930                 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1931         }
1932
1933         struct bkey_s_c_subvolume s =
1934                 bch2_bkey_get_iter_typed(trans, &subvol_iter,
1935                                          BTREE_ID_subvolumes, POS(0, target_subvol),
1936                                          0, subvolume);
1937         ret = bkey_err(s.s_c);
1938         if (ret && !bch2_err_matches(ret, ENOENT))
1939                 return ret;
1940
1941         if (ret) {
1942                 if (fsck_err(c, dirent_to_missing_subvol,
1943                              "dirent points to missing subvolume\n%s",
1944                              (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1945                         return __remove_dirent(trans, d.k->p);
1946                 ret = 0;
1947                 goto out;
1948         }
1949
1950         if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1951                         c, subvol_fs_path_parent_wrong,
1952                         "subvol with wrong fs_path_parent, should be be %u\n%s",
1953                         parent_subvol,
1954                         (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1955                 struct bkey_i_subvolume *n =
1956                         bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1957                 ret = PTR_ERR_OR_ZERO(n);
1958                 if (ret)
1959                         goto err;
1960
1961                 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1962         }
1963
1964         u64 target_inum = le64_to_cpu(s.v->inode);
1965         u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1966
1967         ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1968         if (ret && !bch2_err_matches(ret, ENOENT))
1969                 goto err;
1970
1971         if (ret) {
1972                 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
1973                 ret = -BCH_ERR_fsck_repair_unimplemented;
1974                 ret = 0;
1975                 goto err;
1976         }
1977
1978         if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
1979                         c, inode_bi_parent_wrong,
1980                         "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1981                         target_inum,
1982                         subvol_root.bi_parent_subvol, parent_subvol)) {
1983                 subvol_root.bi_parent_subvol = parent_subvol;
1984                 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1985                 if (ret)
1986                         goto err;
1987         }
1988
1989         ret = check_dirent_target(trans, iter, d, &subvol_root,
1990                                   target_snapshot);
1991         if (ret)
1992                 goto err;
1993 out:
1994 err:
1995 fsck_err:
1996         bch2_trans_iter_exit(trans, &subvol_iter);
1997         printbuf_exit(&buf);
1998         return ret;
1999 }
2000
2001 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2002                         struct bkey_s_c k,
2003                         struct bch_hash_info *hash_info,
2004                         struct inode_walker *dir,
2005                         struct inode_walker *target,
2006                         struct snapshots_seen *s)
2007 {
2008         struct bch_fs *c = trans->c;
2009         struct inode_walker_entry *i;
2010         struct printbuf buf = PRINTBUF;
2011         int ret = 0;
2012
2013         ret = check_key_has_snapshot(trans, iter, k);
2014         if (ret) {
2015                 ret = ret < 0 ? ret : 0;
2016                 goto out;
2017         }
2018
2019         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2020         if (ret)
2021                 goto err;
2022
2023         if (k.k->type == KEY_TYPE_whiteout)
2024                 goto out;
2025
2026         if (dir->last_pos.inode != k.k->p.inode) {
2027                 ret = check_subdir_count(trans, dir);
2028                 if (ret)
2029                         goto err;
2030         }
2031
2032         BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
2033
2034         i = walk_inode(trans, dir, k);
2035         ret = PTR_ERR_OR_ZERO(i);
2036         if (ret < 0)
2037                 goto err;
2038
2039         if (dir->first_this_inode && dir->inodes.nr)
2040                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
2041         dir->first_this_inode = false;
2042
2043         if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
2044                 ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
2045                         bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2046                 if (ret)
2047                         goto err;
2048
2049                 dir->last_pos.inode--;
2050                 ret = -BCH_ERR_transaction_restart_nested;
2051                 goto err;
2052         }
2053
2054         if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
2055                         "dirent in nonexisting directory:\n%s",
2056                         (printbuf_reset(&buf),
2057                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2058                 ret = bch2_btree_delete_at(trans, iter,
2059                                 BTREE_UPDATE_internal_snapshot_node);
2060                 goto out;
2061         }
2062
2063         if (!i)
2064                 goto out;
2065
2066         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
2067                         c, dirent_in_non_dir_inode,
2068                         "dirent in non directory inode type %s:\n%s",
2069                         bch2_d_type_str(inode_d_type(&i->inode)),
2070                         (printbuf_reset(&buf),
2071                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2072                 ret = bch2_btree_delete_at(trans, iter, 0);
2073                 goto out;
2074         }
2075
2076         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2077         if (ret < 0)
2078                 goto err;
2079         if (ret) {
2080                 /* dirent has been deleted */
2081                 ret = 0;
2082                 goto out;
2083         }
2084
2085         if (k.k->type != KEY_TYPE_dirent)
2086                 goto out;
2087
2088         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2089
2090         if (d.v->d_type == DT_SUBVOL) {
2091                 ret = check_dirent_to_subvol(trans, iter, d);
2092                 if (ret)
2093                         goto err;
2094         } else {
2095                 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2096                 if (ret)
2097                         goto err;
2098
2099                 if (fsck_err_on(!target->inodes.nr,
2100                                 c, dirent_to_missing_inode,
2101                                 "dirent points to missing inode:\n%s",
2102                                 (printbuf_reset(&buf),
2103                                  bch2_bkey_val_to_text(&buf, c, k),
2104                                  buf.buf))) {
2105                         ret = __remove_dirent(trans, d.k->p);
2106                         if (ret)
2107                                 goto err;
2108                 }
2109
2110                 darray_for_each(target->inodes, i) {
2111                         ret = check_dirent_target(trans, iter, d,
2112                                                   &i->inode, i->snapshot);
2113                         if (ret)
2114                                 goto err;
2115                 }
2116
2117                 if (d.v->d_type == DT_DIR)
2118                         for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2119                                 i->count++;
2120         }
2121 out:
2122 err:
2123 fsck_err:
2124         printbuf_exit(&buf);
2125         bch_err_fn(c, ret);
2126         return ret;
2127 }
2128
2129 /*
2130  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2131  * validate d_type
2132  */
2133 int bch2_check_dirents(struct bch_fs *c)
2134 {
2135         struct inode_walker dir = inode_walker_init();
2136         struct inode_walker target = inode_walker_init();
2137         struct snapshots_seen s;
2138         struct bch_hash_info hash_info;
2139
2140         snapshots_seen_init(&s);
2141
2142         int ret = bch2_trans_run(c,
2143                 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2144                                 POS(BCACHEFS_ROOT_INO, 0),
2145                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2146                                 k,
2147                                 NULL, NULL,
2148                                 BCH_TRANS_COMMIT_no_enospc,
2149                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2150                 check_subdir_count_notnested(trans, &dir));
2151
2152         snapshots_seen_exit(&s);
2153         inode_walker_exit(&dir);
2154         inode_walker_exit(&target);
2155         bch_err_fn(c, ret);
2156         return ret;
2157 }
2158
2159 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2160                        struct bkey_s_c k,
2161                        struct bch_hash_info *hash_info,
2162                        struct inode_walker *inode)
2163 {
2164         struct bch_fs *c = trans->c;
2165         struct inode_walker_entry *i;
2166         int ret;
2167
2168         ret = check_key_has_snapshot(trans, iter, k);
2169         if (ret < 0)
2170                 return ret;
2171         if (ret)
2172                 return 0;
2173
2174         i = walk_inode(trans, inode, k);
2175         ret = PTR_ERR_OR_ZERO(i);
2176         if (ret)
2177                 return ret;
2178
2179         if (inode->first_this_inode && inode->inodes.nr)
2180                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
2181         inode->first_this_inode = false;
2182
2183         if (fsck_err_on(!i, c, xattr_in_missing_inode,
2184                         "xattr for missing inode %llu",
2185                         k.k->p.inode))
2186                 return bch2_btree_delete_at(trans, iter, 0);
2187
2188         if (!i)
2189                 return 0;
2190
2191         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2192 fsck_err:
2193         bch_err_fn(c, ret);
2194         return ret;
2195 }
2196
2197 /*
2198  * Walk xattrs: verify that they all have a corresponding inode
2199  */
2200 int bch2_check_xattrs(struct bch_fs *c)
2201 {
2202         struct inode_walker inode = inode_walker_init();
2203         struct bch_hash_info hash_info;
2204         int ret = 0;
2205
2206         ret = bch2_trans_run(c,
2207                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2208                         POS(BCACHEFS_ROOT_INO, 0),
2209                         BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2210                         k,
2211                         NULL, NULL,
2212                         BCH_TRANS_COMMIT_no_enospc,
2213                 check_xattr(trans, &iter, k, &hash_info, &inode)));
2214         bch_err_fn(c, ret);
2215         return ret;
2216 }
2217
2218 static int check_root_trans(struct btree_trans *trans)
2219 {
2220         struct bch_fs *c = trans->c;
2221         struct bch_inode_unpacked root_inode;
2222         u32 snapshot;
2223         u64 inum;
2224         int ret;
2225
2226         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2227         if (ret && !bch2_err_matches(ret, ENOENT))
2228                 return ret;
2229
2230         if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2231                                 "root subvol missing")) {
2232                 struct bkey_i_subvolume *root_subvol =
2233                         bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2234                 ret = PTR_ERR_OR_ZERO(root_subvol);
2235                 if (ret)
2236                         goto err;
2237
2238                 snapshot        = U32_MAX;
2239                 inum            = BCACHEFS_ROOT_INO;
2240
2241                 bkey_subvolume_init(&root_subvol->k_i);
2242                 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2243                 root_subvol->v.flags    = 0;
2244                 root_subvol->v.snapshot = cpu_to_le32(snapshot);
2245                 root_subvol->v.inode    = cpu_to_le64(inum);
2246                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2247                 bch_err_msg(c, ret, "writing root subvol");
2248                 if (ret)
2249                         goto err;
2250         }
2251
2252         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2253         if (ret && !bch2_err_matches(ret, ENOENT))
2254                 return ret;
2255
2256         if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2257                                 "root directory missing") ||
2258             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2259                                 c, root_inode_not_dir,
2260                                 "root inode not a directory")) {
2261                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2262                                 0, NULL);
2263                 root_inode.bi_inum = inum;
2264
2265                 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2266                 bch_err_msg(c, ret, "writing root inode");
2267         }
2268 err:
2269 fsck_err:
2270         return ret;
2271 }
2272
2273 /* Get root directory, create if it doesn't exist: */
2274 int bch2_check_root(struct bch_fs *c)
2275 {
2276         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2277                 check_root_trans(trans));
2278         bch_err_fn(c, ret);
2279         return ret;
2280 }
2281
2282 typedef DARRAY(u32) darray_u32;
2283
2284 static bool darray_u32_has(darray_u32 *d, u32 v)
2285 {
2286         darray_for_each(*d, i)
2287                 if (*i == v)
2288                         return true;
2289         return false;
2290 }
2291
2292 /*
2293  * We've checked that inode backpointers point to valid dirents; here, it's
2294  * sufficient to check that the subvolume root has a dirent:
2295  */
2296 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2297 {
2298         struct bch_inode_unpacked inode;
2299         int ret = bch2_inode_find_by_inum_trans(trans,
2300                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2301                                 &inode);
2302         if (ret)
2303                 return ret;
2304
2305         return inode.bi_dir != 0;
2306 }
2307
2308 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2309 {
2310         struct bch_fs *c = trans->c;
2311         struct btree_iter parent_iter = {};
2312         darray_u32 subvol_path = {};
2313         struct printbuf buf = PRINTBUF;
2314         int ret = 0;
2315
2316         if (k.k->type != KEY_TYPE_subvolume)
2317                 return 0;
2318
2319         while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2320                 ret = darray_push(&subvol_path, k.k->p.offset);
2321                 if (ret)
2322                         goto err;
2323
2324                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2325
2326                 ret = subvol_has_dirent(trans, s);
2327                 if (ret < 0)
2328                         break;
2329
2330                 if (fsck_err_on(!ret,
2331                                 c, subvol_unreachable,
2332                                 "unreachable subvolume %s",
2333                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2334                                  buf.buf))) {
2335                         ret = reattach_subvol(trans, s);
2336                         break;
2337                 }
2338
2339                 u32 parent = le32_to_cpu(s.v->fs_path_parent);
2340
2341                 if (darray_u32_has(&subvol_path, parent)) {
2342                         if (fsck_err(c, subvol_loop, "subvolume loop"))
2343                                 ret = reattach_subvol(trans, s);
2344                         break;
2345                 }
2346
2347                 bch2_trans_iter_exit(trans, &parent_iter);
2348                 bch2_trans_iter_init(trans, &parent_iter,
2349                                      BTREE_ID_subvolumes, POS(0, parent), 0);
2350                 k = bch2_btree_iter_peek_slot(&parent_iter);
2351                 ret = bkey_err(k);
2352                 if (ret)
2353                         goto err;
2354
2355                 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2356                                 c, subvol_unreachable,
2357                                 "unreachable subvolume %s",
2358                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2359                                  buf.buf))) {
2360                         ret = reattach_subvol(trans, s);
2361                         break;
2362                 }
2363         }
2364 fsck_err:
2365 err:
2366         printbuf_exit(&buf);
2367         darray_exit(&subvol_path);
2368         bch2_trans_iter_exit(trans, &parent_iter);
2369         return ret;
2370 }
2371
2372 int bch2_check_subvolume_structure(struct bch_fs *c)
2373 {
2374         int ret = bch2_trans_run(c,
2375                 for_each_btree_key_commit(trans, iter,
2376                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2377                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2378                         check_subvol_path(trans, &iter, k)));
2379         bch_err_fn(c, ret);
2380         return ret;
2381 }
2382
2383 struct pathbuf_entry {
2384         u64     inum;
2385         u32     snapshot;
2386 };
2387
2388 typedef DARRAY(struct pathbuf_entry) pathbuf;
2389
2390 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2391 {
2392         darray_for_each(*p, i)
2393                 if (i->inum     == inum &&
2394                     i->snapshot == snapshot)
2395                         return true;
2396         return false;
2397 }
2398
2399 /*
2400  * Check that a given inode is reachable from its subvolume root - we already
2401  * verified subvolume connectivity:
2402  *
2403  * XXX: we should also be verifying that inodes are in the right subvolumes
2404  */
2405 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2406 {
2407         struct bch_fs *c = trans->c;
2408         struct btree_iter inode_iter = {};
2409         struct bch_inode_unpacked inode;
2410         struct printbuf buf = PRINTBUF;
2411         u32 snapshot = inode_k.k->p.snapshot;
2412         int ret = 0;
2413
2414         p->nr = 0;
2415
2416         BUG_ON(bch2_inode_unpack(inode_k, &inode));
2417
2418         while (!inode.bi_subvol) {
2419                 struct btree_iter dirent_iter;
2420                 struct bkey_s_c_dirent d;
2421                 u32 parent_snapshot = snapshot;
2422
2423                 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2424                 ret = bkey_err(d.s_c);
2425                 if (ret && !bch2_err_matches(ret, ENOENT))
2426                         break;
2427
2428                 if (!ret && !dirent_points_to_inode(d, &inode)) {
2429                         bch2_trans_iter_exit(trans, &dirent_iter);
2430                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2431                 }
2432
2433                 if (bch2_err_matches(ret, ENOENT)) {
2434                         ret = 0;
2435                         if (fsck_err(c, inode_unreachable,
2436                                      "unreachable inode\n%s",
2437                                      (printbuf_reset(&buf),
2438                                       bch2_bkey_val_to_text(&buf, c, inode_k),
2439                                       buf.buf)))
2440                                 ret = reattach_inode(trans, &inode, snapshot);
2441                         goto out;
2442                 }
2443
2444                 bch2_trans_iter_exit(trans, &dirent_iter);
2445
2446                 if (!S_ISDIR(inode.bi_mode))
2447                         break;
2448
2449                 ret = darray_push(p, ((struct pathbuf_entry) {
2450                         .inum           = inode.bi_inum,
2451                         .snapshot       = snapshot,
2452                 }));
2453                 if (ret)
2454                         return ret;
2455
2456                 snapshot = parent_snapshot;
2457
2458                 bch2_trans_iter_exit(trans, &inode_iter);
2459                 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2460                                              SPOS(0, inode.bi_dir, snapshot), 0);
2461                 ret = bkey_err(inode_k) ?:
2462                         !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2463                         : bch2_inode_unpack(inode_k, &inode);
2464                 if (ret) {
2465                         /* Should have been caught in dirents pass */
2466                         if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2467                                 bch_err(c, "error looking up parent directory: %i", ret);
2468                         break;
2469                 }
2470
2471                 snapshot = inode_k.k->p.snapshot;
2472
2473                 if (path_is_dup(p, inode.bi_inum, snapshot)) {
2474                         /* XXX print path */
2475                         bch_err(c, "directory structure loop");
2476
2477                         darray_for_each(*p, i)
2478                                 pr_err("%llu:%u", i->inum, i->snapshot);
2479                         pr_err("%llu:%u", inode.bi_inum, snapshot);
2480
2481                         if (fsck_err(c, dir_loop, "directory structure loop")) {
2482                                 ret = remove_backpointer(trans, &inode);
2483                                 bch_err_msg(c, ret, "removing dirent");
2484                                 if (ret)
2485                                         break;
2486
2487                                 ret = reattach_inode(trans, &inode, snapshot);
2488                                 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2489                         }
2490                         break;
2491                 }
2492         }
2493 out:
2494 fsck_err:
2495         bch2_trans_iter_exit(trans, &inode_iter);
2496         printbuf_exit(&buf);
2497         bch_err_fn(c, ret);
2498         return ret;
2499 }
2500
2501 /*
2502  * Check for unreachable inodes, as well as loops in the directory structure:
2503  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2504  * unreachable:
2505  */
2506 int bch2_check_directory_structure(struct bch_fs *c)
2507 {
2508         pathbuf path = { 0, };
2509         int ret;
2510
2511         ret = bch2_trans_run(c,
2512                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2513                                           BTREE_ITER_intent|
2514                                           BTREE_ITER_prefetch|
2515                                           BTREE_ITER_all_snapshots, k,
2516                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2517                         if (!bkey_is_inode(k.k))
2518                                 continue;
2519
2520                         if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2521                                 continue;
2522
2523                         check_path(trans, &path, k);
2524                 })));
2525         darray_exit(&path);
2526
2527         bch_err_fn(c, ret);
2528         return ret;
2529 }
2530
2531 struct nlink_table {
2532         size_t          nr;
2533         size_t          size;
2534
2535         struct nlink {
2536                 u64     inum;
2537                 u32     snapshot;
2538                 u32     count;
2539         }               *d;
2540 };
2541
2542 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2543                      u64 inum, u32 snapshot)
2544 {
2545         if (t->nr == t->size) {
2546                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2547                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2548
2549                 if (!d) {
2550                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2551                                 new_size);
2552                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2553                 }
2554
2555                 if (t->d)
2556                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2557                 kvfree(t->d);
2558
2559                 t->d = d;
2560                 t->size = new_size;
2561         }
2562
2563
2564         t->d[t->nr++] = (struct nlink) {
2565                 .inum           = inum,
2566                 .snapshot       = snapshot,
2567         };
2568
2569         return 0;
2570 }
2571
2572 static int nlink_cmp(const void *_l, const void *_r)
2573 {
2574         const struct nlink *l = _l;
2575         const struct nlink *r = _r;
2576
2577         return cmp_int(l->inum, r->inum);
2578 }
2579
2580 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2581                      struct nlink_table *links,
2582                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2583 {
2584         struct nlink *link, key = {
2585                 .inum = inum, .snapshot = U32_MAX,
2586         };
2587
2588         if (inum < range_start || inum >= range_end)
2589                 return;
2590
2591         link = __inline_bsearch(&key, links->d, links->nr,
2592                                 sizeof(links->d[0]), nlink_cmp);
2593         if (!link)
2594                 return;
2595
2596         while (link > links->d && link[0].inum == link[-1].inum)
2597                 --link;
2598
2599         for (; link < links->d + links->nr && link->inum == inum; link++)
2600                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2601                         link->count++;
2602                         if (link->snapshot >= snapshot)
2603                                 break;
2604                 }
2605 }
2606
2607 noinline_for_stack
2608 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2609                                        struct nlink_table *t,
2610                                        u64 start, u64 *end)
2611 {
2612         int ret = bch2_trans_run(c,
2613                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2614                                    POS(0, start),
2615                                    BTREE_ITER_intent|
2616                                    BTREE_ITER_prefetch|
2617                                    BTREE_ITER_all_snapshots, k, ({
2618                         if (!bkey_is_inode(k.k))
2619                                 continue;
2620
2621                         /* Should never fail, checked by bch2_inode_invalid: */
2622                         struct bch_inode_unpacked u;
2623                         BUG_ON(bch2_inode_unpack(k, &u));
2624
2625                         /*
2626                          * Backpointer and directory structure checks are sufficient for
2627                          * directories, since they can't have hardlinks:
2628                          */
2629                         if (S_ISDIR(u.bi_mode))
2630                                 continue;
2631
2632                         if (!u.bi_nlink)
2633                                 continue;
2634
2635                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2636                         if (ret) {
2637                                 *end = k.k->p.offset;
2638                                 ret = 0;
2639                                 break;
2640                         }
2641                         0;
2642                 })));
2643
2644         bch_err_fn(c, ret);
2645         return ret;
2646 }
2647
2648 noinline_for_stack
2649 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2650                                      u64 range_start, u64 range_end)
2651 {
2652         struct snapshots_seen s;
2653
2654         snapshots_seen_init(&s);
2655
2656         int ret = bch2_trans_run(c,
2657                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2658                                    BTREE_ITER_intent|
2659                                    BTREE_ITER_prefetch|
2660                                    BTREE_ITER_all_snapshots, k, ({
2661                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2662                         if (ret)
2663                                 break;
2664
2665                         if (k.k->type == KEY_TYPE_dirent) {
2666                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2667
2668                                 if (d.v->d_type != DT_DIR &&
2669                                     d.v->d_type != DT_SUBVOL)
2670                                         inc_link(c, &s, links, range_start, range_end,
2671                                                  le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
2672                         }
2673                         0;
2674                 })));
2675
2676         snapshots_seen_exit(&s);
2677
2678         bch_err_fn(c, ret);
2679         return ret;
2680 }
2681
2682 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2683                                      struct bkey_s_c k,
2684                                      struct nlink_table *links,
2685                                      size_t *idx, u64 range_end)
2686 {
2687         struct bch_fs *c = trans->c;
2688         struct bch_inode_unpacked u;
2689         struct nlink *link = &links->d[*idx];
2690         int ret = 0;
2691
2692         if (k.k->p.offset >= range_end)
2693                 return 1;
2694
2695         if (!bkey_is_inode(k.k))
2696                 return 0;
2697
2698         BUG_ON(bch2_inode_unpack(k, &u));
2699
2700         if (S_ISDIR(u.bi_mode))
2701                 return 0;
2702
2703         if (!u.bi_nlink)
2704                 return 0;
2705
2706         while ((cmp_int(link->inum, k.k->p.offset) ?:
2707                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2708                 BUG_ON(*idx == links->nr);
2709                 link = &links->d[++*idx];
2710         }
2711
2712         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2713                         c, inode_wrong_nlink,
2714                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2715                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2716                         bch2_inode_nlink_get(&u), link->count)) {
2717                 bch2_inode_nlink_set(&u, link->count);
2718                 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2719         }
2720 fsck_err:
2721         return ret;
2722 }
2723
2724 noinline_for_stack
2725 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2726                                struct nlink_table *links,
2727                                u64 range_start, u64 range_end)
2728 {
2729         size_t idx = 0;
2730
2731         int ret = bch2_trans_run(c,
2732                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2733                                 POS(0, range_start),
2734                                 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2735                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2736                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2737         if (ret < 0) {
2738                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2739                 return ret;
2740         }
2741
2742         return 0;
2743 }
2744
2745 int bch2_check_nlinks(struct bch_fs *c)
2746 {
2747         struct nlink_table links = { 0 };
2748         u64 this_iter_range_start, next_iter_range_start = 0;
2749         int ret = 0;
2750
2751         do {
2752                 this_iter_range_start = next_iter_range_start;
2753                 next_iter_range_start = U64_MAX;
2754
2755                 ret = check_nlinks_find_hardlinks(c, &links,
2756                                                   this_iter_range_start,
2757                                                   &next_iter_range_start);
2758
2759                 ret = check_nlinks_walk_dirents(c, &links,
2760                                           this_iter_range_start,
2761                                           next_iter_range_start);
2762                 if (ret)
2763                         break;
2764
2765                 ret = check_nlinks_update_hardlinks(c, &links,
2766                                          this_iter_range_start,
2767                                          next_iter_range_start);
2768                 if (ret)
2769                         break;
2770
2771                 links.nr = 0;
2772         } while (next_iter_range_start != U64_MAX);
2773
2774         kvfree(links.d);
2775         bch_err_fn(c, ret);
2776         return ret;
2777 }
2778
2779 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2780                              struct bkey_s_c k)
2781 {
2782         struct bkey_s_c_reflink_p p;
2783         struct bkey_i_reflink_p *u;
2784
2785         if (k.k->type != KEY_TYPE_reflink_p)
2786                 return 0;
2787
2788         p = bkey_s_c_to_reflink_p(k);
2789
2790         if (!p.v->front_pad && !p.v->back_pad)
2791                 return 0;
2792
2793         u = bch2_trans_kmalloc(trans, sizeof(*u));
2794         int ret = PTR_ERR_OR_ZERO(u);
2795         if (ret)
2796                 return ret;
2797
2798         bkey_reassemble(&u->k_i, k);
2799         u->v.front_pad  = 0;
2800         u->v.back_pad   = 0;
2801
2802         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2803 }
2804
2805 int bch2_fix_reflink_p(struct bch_fs *c)
2806 {
2807         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2808                 return 0;
2809
2810         int ret = bch2_trans_run(c,
2811                 for_each_btree_key_commit(trans, iter,
2812                                 BTREE_ID_extents, POS_MIN,
2813                                 BTREE_ITER_intent|BTREE_ITER_prefetch|
2814                                 BTREE_ITER_all_snapshots, k,
2815                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2816                         fix_reflink_p_key(trans, &iter, k)));
2817         bch_err_fn(c, ret);
2818         return ret;
2819 }
This page took 0.19583 seconds and 4 git commands to generate.