]> Git Repo - linux.git/blob - fs/bcachefs/fsck.c
Linux 6.14-rc3
[linux.git] / fs / bcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fs-common.h"
14 #include "fsck.h"
15 #include "inode.h"
16 #include "keylist.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25
26 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
27                                    struct bkey_s_c_dirent d)
28 {
29         return  inode->bi_dir           == d.k->p.inode &&
30                 inode->bi_dir_offset    == d.k->p.offset;
31 }
32
33 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
34                                          struct bch_inode_unpacked *inode)
35 {
36         if (d.v->d_type == DT_SUBVOL
37             ? le32_to_cpu(d.v->d_child_subvol)  == inode->bi_subvol
38             : le64_to_cpu(d.v->d_inum)          == inode->bi_inum)
39                 return 0;
40         return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
41 }
42
43 static void dirent_inode_mismatch_msg(struct printbuf *out,
44                                       struct bch_fs *c,
45                                       struct bkey_s_c_dirent dirent,
46                                       struct bch_inode_unpacked *inode)
47 {
48         prt_str(out, "inode points to dirent that does not point back:");
49         prt_newline(out);
50         bch2_bkey_val_to_text(out, c, dirent.s_c);
51         prt_newline(out);
52         bch2_inode_unpacked_to_text(out, inode);
53 }
54
55 static int dirent_points_to_inode(struct bch_fs *c,
56                                   struct bkey_s_c_dirent dirent,
57                                   struct bch_inode_unpacked *inode)
58 {
59         int ret = dirent_points_to_inode_nowarn(dirent, inode);
60         if (ret) {
61                 struct printbuf buf = PRINTBUF;
62                 dirent_inode_mismatch_msg(&buf, c, dirent, inode);
63                 bch_warn(c, "%s", buf.buf);
64                 printbuf_exit(&buf);
65         }
66         return ret;
67 }
68
69 /*
70  * XXX: this is handling transaction restarts without returning
71  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
72  */
73 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
74                                     u32 snapshot)
75 {
76         u64 sectors = 0;
77
78         int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
79                                 SPOS(inum, 0, snapshot),
80                                 POS(inum, U64_MAX),
81                                 0, k, ({
82                 if (bkey_extent_is_allocation(k.k))
83                         sectors += k.k->size;
84                 0;
85         }));
86
87         return ret ?: sectors;
88 }
89
90 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
91                                     u32 snapshot)
92 {
93         u64 subdirs = 0;
94
95         int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
96                                     SPOS(inum, 0, snapshot),
97                                     POS(inum, U64_MAX),
98                                     0, k, ({
99                 if (k.k->type == KEY_TYPE_dirent &&
100                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
101                         subdirs++;
102                 0;
103         }));
104
105         return ret ?: subdirs;
106 }
107
108 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
109                          u32 *snapshot, u64 *inum)
110 {
111         struct bch_subvolume s;
112         int ret = bch2_subvolume_get(trans, subvol, false, &s);
113
114         *snapshot = le32_to_cpu(s.snapshot);
115         *inum = le64_to_cpu(s.inode);
116         return ret;
117 }
118
119 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
120                               struct bch_inode_unpacked *inode)
121 {
122         struct btree_iter iter;
123         struct bkey_s_c k;
124         int ret;
125
126         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
127                                      BTREE_ITER_all_snapshots, k, ret) {
128                 if (k.k->p.offset != inode_nr)
129                         break;
130                 if (!bkey_is_inode(k.k))
131                         continue;
132                 ret = bch2_inode_unpack(k, inode);
133                 goto found;
134         }
135         ret = -BCH_ERR_ENOENT_inode;
136 found:
137         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
138         bch2_trans_iter_exit(trans, &iter);
139         return ret;
140 }
141
142 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
143                         struct bch_inode_unpacked *inode)
144 {
145         struct btree_iter iter;
146         struct bkey_s_c k;
147         int ret;
148
149         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
150                                SPOS(0, inode_nr, snapshot), 0);
151         ret = bkey_err(k);
152         if (ret)
153                 goto err;
154
155         ret = bkey_is_inode(k.k)
156                 ? bch2_inode_unpack(k, inode)
157                 : -BCH_ERR_ENOENT_inode;
158 err:
159         bch2_trans_iter_exit(trans, &iter);
160         return ret;
161 }
162
163 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
164                            struct bch_hash_info hash_info,
165                            subvol_inum dir, struct qstr *name,
166                            u64 *target, unsigned *type, u32 snapshot)
167 {
168         struct btree_iter iter;
169         struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
170                                                          &hash_info, dir, name, 0, snapshot);
171         int ret = bkey_err(k);
172         if (ret)
173                 return ret;
174
175         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
176         *target = le64_to_cpu(d.v->d_inum);
177         *type = d.v->d_type;
178         bch2_trans_iter_exit(trans, &iter);
179         return 0;
180 }
181
182 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
183 {
184         struct bch_fs *c = trans->c;
185         struct btree_iter iter;
186         struct bch_inode_unpacked dir_inode;
187         struct bch_hash_info dir_hash_info;
188         int ret;
189
190         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
191         if (ret)
192                 goto err;
193
194         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
195
196         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
197
198         ret =   bch2_btree_iter_traverse(&iter) ?:
199                 bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
200                                     &dir_hash_info, &iter,
201                                     BTREE_UPDATE_internal_snapshot_node);
202         bch2_trans_iter_exit(trans, &iter);
203 err:
204         bch_err_fn(c, ret);
205         return ret;
206 }
207
208 /*
209  * Find any subvolume associated with a tree of snapshots
210  * We can't rely on master_subvol - it might have been deleted.
211  */
212 static int find_snapshot_tree_subvol(struct btree_trans *trans,
213                                      u32 tree_id, u32 *subvol)
214 {
215         struct btree_iter iter;
216         struct bkey_s_c k;
217         int ret;
218
219         for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
220                 if (k.k->type != KEY_TYPE_snapshot)
221                         continue;
222
223                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
224                 if (le32_to_cpu(s.v->tree) != tree_id)
225                         continue;
226
227                 if (s.v->subvol) {
228                         *subvol = le32_to_cpu(s.v->subvol);
229                         goto found;
230                 }
231         }
232         ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
233 found:
234         bch2_trans_iter_exit(trans, &iter);
235         return ret;
236 }
237
238 /* Get lost+found, create if it doesn't exist: */
239 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
240                             struct bch_inode_unpacked *lostfound,
241                             u64 reattaching_inum)
242 {
243         struct bch_fs *c = trans->c;
244         struct qstr lostfound_str = QSTR("lost+found");
245         struct btree_iter lostfound_iter = { NULL };
246         u64 inum = 0;
247         unsigned d_type = 0;
248         int ret;
249
250         struct bch_snapshot_tree st;
251         ret = bch2_snapshot_tree_lookup(trans,
252                         bch2_snapshot_tree(c, snapshot), &st);
253         if (ret)
254                 return ret;
255
256         u32 subvolid;
257         ret = find_snapshot_tree_subvol(trans,
258                                 bch2_snapshot_tree(c, snapshot), &subvolid);
259         bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
260                     bch2_snapshot_tree(c, snapshot));
261         if (ret)
262                 return ret;
263
264         struct bch_subvolume subvol;
265         ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
266         bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
267         if (ret)
268                 return ret;
269
270         if (!subvol.inode) {
271                 struct btree_iter iter;
272                 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
273                                 BTREE_ID_subvolumes, POS(0, subvolid),
274                                 0, subvolume);
275                 ret = PTR_ERR_OR_ZERO(subvol);
276                 if (ret)
277                         return ret;
278
279                 subvol->v.inode = cpu_to_le64(reattaching_inum);
280                 bch2_trans_iter_exit(trans, &iter);
281         }
282
283         subvol_inum root_inum = {
284                 .subvol = subvolid,
285                 .inum = le64_to_cpu(subvol.inode)
286         };
287
288         struct bch_inode_unpacked root_inode;
289         struct bch_hash_info root_hash_info;
290         ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
291         bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
292                     root_inum.inum, subvolid);
293         if (ret)
294                 return ret;
295
296         root_hash_info = bch2_hash_info_init(c, &root_inode);
297
298         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
299                               &lostfound_str, &inum, &d_type, snapshot);
300         if (bch2_err_matches(ret, ENOENT))
301                 goto create_lostfound;
302
303         bch_err_fn(c, ret);
304         if (ret)
305                 return ret;
306
307         if (d_type != DT_DIR) {
308                 bch_err(c, "error looking up lost+found: not a directory");
309                 return -BCH_ERR_ENOENT_not_directory;
310         }
311
312         /*
313          * The bch2_check_dirents pass has already run, dangling dirents
314          * shouldn't exist here:
315          */
316         ret = lookup_inode(trans, inum, snapshot, lostfound);
317         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
318                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
319         return ret;
320
321 create_lostfound:
322         /*
323          * we always create lost+found in the root snapshot; we don't want
324          * different branches of the snapshot tree to have different lost+found
325          */
326         snapshot = le32_to_cpu(st.root_snapshot);
327         /*
328          * XXX: we could have a nicer log message here  if we had a nice way to
329          * walk backpointers to print a path
330          */
331         struct printbuf path = PRINTBUF;
332         ret = bch2_inum_to_path(trans, root_inum, &path);
333         if (ret)
334                 goto err;
335
336         bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
337                    path.buf, root_inum.subvol, snapshot);
338         printbuf_exit(&path);
339
340         u64 now = bch2_current_time(c);
341         u64 cpu = raw_smp_processor_id();
342
343         bch2_inode_init_early(c, lostfound);
344         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
345         lostfound->bi_dir = root_inode.bi_inum;
346         lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
347
348         root_inode.bi_nlink++;
349
350         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
351         if (ret)
352                 goto err;
353
354         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
355         ret = bch2_btree_iter_traverse(&lostfound_iter);
356         if (ret)
357                 goto err;
358
359         ret =   bch2_dirent_create_snapshot(trans,
360                                 0, root_inode.bi_inum, snapshot, &root_hash_info,
361                                 mode_to_type(lostfound->bi_mode),
362                                 &lostfound_str,
363                                 lostfound->bi_inum,
364                                 &lostfound->bi_dir_offset,
365                                 STR_HASH_must_create) ?:
366                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
367                                        BTREE_UPDATE_internal_snapshot_node);
368 err:
369         bch_err_msg(c, ret, "creating lost+found");
370         bch2_trans_iter_exit(trans, &lostfound_iter);
371         return ret;
372 }
373
374 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
375 {
376         if (inode->bi_inum == BCACHEFS_ROOT_INO &&
377             inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
378                 return false;
379
380         return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
381 }
382
383 static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
384 {
385         struct btree_iter iter;
386         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
387                                         SPOS(d_pos.inode, d_pos.offset, snapshot),
388                                         BTREE_ITER_intent|
389                                         BTREE_ITER_with_updates);
390         int ret = bkey_err(k);
391         if (ret)
392                 return ret;
393
394         if (bpos_eq(k.k->p, d_pos)) {
395                 /*
396                  * delet_at() doesn't work because the update path doesn't
397                  * internally use BTREE_ITER_with_updates yet
398                  */
399                 struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
400                 ret = PTR_ERR_OR_ZERO(k);
401                 if (ret)
402                         goto err;
403
404                 bkey_init(&k->k);
405                 k->k.type = KEY_TYPE_whiteout;
406                 k->k.p = iter.pos;
407                 ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
408         }
409 err:
410         bch2_trans_iter_exit(trans, &iter);
411         return ret;
412 }
413
414 static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
415 {
416         struct bch_fs *c = trans->c;
417         struct bch_inode_unpacked lostfound;
418         char name_buf[20];
419         int ret;
420
421         u32 dirent_snapshot = inode->bi_snapshot;
422         if (inode->bi_subvol) {
423                 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
424
425                 u64 root_inum;
426                 ret = subvol_lookup(trans, inode->bi_parent_subvol,
427                                     &dirent_snapshot, &root_inum);
428                 if (ret)
429                         return ret;
430
431                 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
432         } else {
433                 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
434         }
435
436         ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
437         if (ret)
438                 return ret;
439
440         lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
441
442         /* ensure lost+found inode is also present in inode snapshot */
443         if (!inode->bi_subvol) {
444                 BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
445                 lostfound.bi_snapshot = inode->bi_snapshot;
446         }
447
448         ret = __bch2_fsck_write_inode(trans, &lostfound);
449         if (ret)
450                 return ret;
451
452         struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
453         struct qstr name = QSTR(name_buf);
454
455         inode->bi_dir = lostfound.bi_inum;
456
457         ret = bch2_dirent_create_snapshot(trans,
458                                 inode->bi_parent_subvol, lostfound.bi_inum,
459                                 dirent_snapshot,
460                                 &dir_hash,
461                                 inode_d_type(inode),
462                                 &name,
463                                 inode->bi_subvol ?: inode->bi_inum,
464                                 &inode->bi_dir_offset,
465                                 STR_HASH_must_create);
466         if (ret) {
467                 bch_err_msg(c, ret, "error creating dirent");
468                 return ret;
469         }
470
471         ret = __bch2_fsck_write_inode(trans, inode);
472         if (ret)
473                 return ret;
474
475         /*
476          * Fix up inodes in child snapshots: if they should also be reattached
477          * update the backpointer field, if they should not be we need to emit
478          * whiteouts for the dirent we just created.
479          */
480         if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
481                 snapshot_id_list whiteouts_done;
482                 struct btree_iter iter;
483                 struct bkey_s_c k;
484
485                 darray_init(&whiteouts_done);
486
487                 for_each_btree_key_reverse_norestart(trans, iter,
488                                 BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
489                                 BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
490                         if (k.k->p.offset != inode->bi_inum)
491                                 break;
492
493                         if (!bkey_is_inode(k.k) ||
494                             !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
495                             snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
496                                 continue;
497
498                         struct bch_inode_unpacked child_inode;
499                         ret = bch2_inode_unpack(k, &child_inode);
500                         if (ret)
501                                 break;
502
503                         if (!inode_should_reattach(&child_inode)) {
504                                 ret = maybe_delete_dirent(trans,
505                                                           SPOS(lostfound.bi_inum, inode->bi_dir_offset,
506                                                                dirent_snapshot),
507                                                           k.k->p.snapshot);
508                                 if (ret)
509                                         break;
510
511                                 ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
512                                 if (ret)
513                                         break;
514                         } else {
515                                 iter.snapshot = k.k->p.snapshot;
516                                 child_inode.bi_dir = inode->bi_dir;
517                                 child_inode.bi_dir_offset = inode->bi_dir_offset;
518
519                                 ret = bch2_inode_write_flags(trans, &iter, &child_inode,
520                                                              BTREE_UPDATE_internal_snapshot_node);
521                                 if (ret)
522                                         break;
523                         }
524                 }
525                 darray_exit(&whiteouts_done);
526                 bch2_trans_iter_exit(trans, &iter);
527         }
528
529         return ret;
530 }
531
532 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
533                                                 struct btree_iter *iter,
534                                                 struct bpos pos)
535 {
536         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
537 }
538
539 static int remove_backpointer(struct btree_trans *trans,
540                               struct bch_inode_unpacked *inode)
541 {
542         if (!inode->bi_dir)
543                 return 0;
544
545         struct bch_fs *c = trans->c;
546         struct btree_iter iter;
547         struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
548                                      SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
549         int ret = bkey_err(d) ?:
550                   dirent_points_to_inode(c, d, inode) ?:
551                   __remove_dirent(trans, d.k->p);
552         bch2_trans_iter_exit(trans, &iter);
553         return ret;
554 }
555
556 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
557 {
558         struct bch_fs *c = trans->c;
559
560         struct bch_inode_unpacked inode;
561         int ret = bch2_inode_find_by_inum_trans(trans,
562                                 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
563                                 &inode);
564         if (ret)
565                 return ret;
566
567         ret = remove_backpointer(trans, &inode);
568         if (!bch2_err_matches(ret, ENOENT))
569                 bch_err_msg(c, ret, "removing dirent");
570         if (ret)
571                 return ret;
572
573         ret = reattach_inode(trans, &inode);
574         bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
575         return ret;
576 }
577
578 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
579 {
580         struct bch_fs *c = trans->c;
581
582         if (!bch2_snapshot_is_leaf(c, snapshotid)) {
583                 bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
584                 return -BCH_ERR_fsck_repair_unimplemented;
585         }
586
587         /*
588          * If inum isn't set, that means we're being called from check_dirents,
589          * not check_inodes - the root of this subvolume doesn't exist or we
590          * would have found it there:
591          */
592         if (!inum) {
593                 struct btree_iter inode_iter = {};
594                 struct bch_inode_unpacked new_inode;
595                 u64 cpu = raw_smp_processor_id();
596
597                 bch2_inode_init_early(c, &new_inode);
598                 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
599
600                 new_inode.bi_subvol = subvolid;
601
602                 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
603                           bch2_btree_iter_traverse(&inode_iter) ?:
604                           bch2_inode_write(trans, &inode_iter, &new_inode);
605                 bch2_trans_iter_exit(trans, &inode_iter);
606                 if (ret)
607                         return ret;
608
609                 inum = new_inode.bi_inum;
610         }
611
612         bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
613
614         struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
615         int ret = PTR_ERR_OR_ZERO(new_subvol);
616         if (ret)
617                 return ret;
618
619         bkey_subvolume_init(&new_subvol->k_i);
620         new_subvol->k.p.offset  = subvolid;
621         new_subvol->v.snapshot  = cpu_to_le32(snapshotid);
622         new_subvol->v.inode     = cpu_to_le64(inum);
623         ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
624         if (ret)
625                 return ret;
626
627         struct btree_iter iter;
628         struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
629                         BTREE_ID_snapshots, POS(0, snapshotid),
630                         0, snapshot);
631         ret = PTR_ERR_OR_ZERO(s);
632         bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
633         if (ret)
634                 return ret;
635
636         u32 snapshot_tree = le32_to_cpu(s->v.tree);
637
638         s->v.subvol = cpu_to_le32(subvolid);
639         SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
640         bch2_trans_iter_exit(trans, &iter);
641
642         struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
643                         BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
644                         0, snapshot_tree);
645         ret = PTR_ERR_OR_ZERO(st);
646         bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
647         if (ret)
648                 return ret;
649
650         if (!st->v.master_subvol)
651                 st->v.master_subvol = cpu_to_le32(subvolid);
652
653         bch2_trans_iter_exit(trans, &iter);
654         return 0;
655 }
656
657 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
658 {
659         struct bch_fs *c = trans->c;
660         unsigned i_mode = S_IFREG;
661         u64 i_size = 0;
662
663         switch (btree) {
664         case BTREE_ID_extents: {
665                 struct btree_iter iter = {};
666
667                 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
668                 struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
669                 bch2_trans_iter_exit(trans, &iter);
670                 int ret = bkey_err(k);
671                 if (ret)
672                         return ret;
673
674                 i_size = k.k->p.offset << 9;
675                 break;
676         }
677         case BTREE_ID_dirents:
678                 i_mode = S_IFDIR;
679                 break;
680         case BTREE_ID_xattrs:
681                 break;
682         default:
683                 BUG();
684         }
685
686         struct bch_inode_unpacked new_inode;
687         bch2_inode_init_early(c, &new_inode);
688         bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
689         new_inode.bi_size = i_size;
690         new_inode.bi_inum = inum;
691         new_inode.bi_snapshot = snapshot;
692
693         return __bch2_fsck_write_inode(trans, &new_inode);
694 }
695
696 struct snapshots_seen {
697         struct bpos                     pos;
698         snapshot_id_list                ids;
699 };
700
701 static inline void snapshots_seen_exit(struct snapshots_seen *s)
702 {
703         darray_exit(&s->ids);
704 }
705
706 static inline void snapshots_seen_init(struct snapshots_seen *s)
707 {
708         memset(s, 0, sizeof(*s));
709 }
710
711 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
712 {
713         u32 *i;
714         __darray_for_each(s->ids, i) {
715                 if (*i == id)
716                         return 0;
717                 if (*i > id)
718                         break;
719         }
720
721         int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
722         if (ret)
723                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
724                         s->ids.size);
725         return ret;
726 }
727
728 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
729                                  enum btree_id btree_id, struct bpos pos)
730 {
731         if (!bkey_eq(s->pos, pos))
732                 s->ids.nr = 0;
733         s->pos = pos;
734
735         return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
736 }
737
738 /**
739  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
740  * and @ancestor hasn't been overwritten in @seen
741  *
742  * @c:          filesystem handle
743  * @seen:       list of snapshot ids already seen at current position
744  * @id:         descendent snapshot id
745  * @ancestor:   ancestor snapshot id
746  *
747  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
748  */
749 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
750                                     u32 id, u32 ancestor)
751 {
752         ssize_t i;
753
754         EBUG_ON(id > ancestor);
755
756         /* @ancestor should be the snapshot most recently added to @seen */
757         EBUG_ON(ancestor != seen->pos.snapshot);
758         EBUG_ON(ancestor != darray_last(seen->ids));
759
760         if (id == ancestor)
761                 return true;
762
763         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
764                 return false;
765
766         /*
767          * We know that @id is a descendant of @ancestor, we're checking if
768          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
769          * @ascestor and with @id as a descendent.
770          *
771          * But we already know that we're scanning IDs between @id and @ancestor
772          * numerically, since snapshot ID lists are kept sorted, so if we find
773          * an id that's an ancestor of @id we're done:
774          */
775
776         for (i = seen->ids.nr - 2;
777              i >= 0 && seen->ids.data[i] >= id;
778              --i)
779                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
780                         return false;
781
782         return true;
783 }
784
785 /**
786  * ref_visible - given a key with snapshot id @src that points to a key with
787  * snapshot id @dst, test whether there is some snapshot in which @dst is
788  * visible.
789  *
790  * @c:          filesystem handle
791  * @s:          list of snapshot IDs already seen at @src
792  * @src:        snapshot ID of src key
793  * @dst:        snapshot ID of dst key
794  * Returns:     true if there is some snapshot in which @dst is visible
795  *
796  * Assumes we're visiting @src keys in natural key order
797  */
798 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
799                         u32 src, u32 dst)
800 {
801         return dst <= src
802                 ? key_visible_in_snapshot(c, s, dst, src)
803                 : bch2_snapshot_is_ancestor(c, src, dst);
804 }
805
806 static int ref_visible2(struct bch_fs *c,
807                         u32 src, struct snapshots_seen *src_seen,
808                         u32 dst, struct snapshots_seen *dst_seen)
809 {
810         if (dst > src) {
811                 swap(dst, src);
812                 swap(dst_seen, src_seen);
813         }
814         return key_visible_in_snapshot(c, src_seen, dst, src);
815 }
816
817 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
818         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
819              (_i)->snapshot <= (_snapshot); _i++)                                       \
820                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
821
822 struct inode_walker_entry {
823         struct bch_inode_unpacked inode;
824         u32                     snapshot;
825         u64                     count;
826 };
827
828 struct inode_walker {
829         bool                            first_this_inode;
830         bool                            have_inodes;
831         bool                            recalculate_sums;
832         struct bpos                     last_pos;
833
834         DARRAY(struct inode_walker_entry) inodes;
835         snapshot_id_list                deletes;
836 };
837
838 static void inode_walker_exit(struct inode_walker *w)
839 {
840         darray_exit(&w->inodes);
841         darray_exit(&w->deletes);
842 }
843
844 static struct inode_walker inode_walker_init(void)
845 {
846         return (struct inode_walker) { 0, };
847 }
848
849 static int add_inode(struct bch_fs *c, struct inode_walker *w,
850                      struct bkey_s_c inode)
851 {
852         struct bch_inode_unpacked u;
853
854         return bch2_inode_unpack(inode, &u) ?:
855                 darray_push(&w->inodes, ((struct inode_walker_entry) {
856                 .inode          = u,
857                 .snapshot       = inode.k->p.snapshot,
858         }));
859 }
860
861 static int get_inodes_all_snapshots(struct btree_trans *trans,
862                                     struct inode_walker *w, u64 inum)
863 {
864         struct bch_fs *c = trans->c;
865         struct btree_iter iter;
866         struct bkey_s_c k;
867         int ret;
868
869         /*
870          * We no longer have inodes for w->last_pos; clear this to avoid
871          * screwing up check_i_sectors/check_subdir_count if we take a
872          * transaction restart here:
873          */
874         w->have_inodes = false;
875         w->recalculate_sums = false;
876         w->inodes.nr = 0;
877
878         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
879                                      BTREE_ITER_all_snapshots, k, ret) {
880                 if (k.k->p.offset != inum)
881                         break;
882
883                 if (bkey_is_inode(k.k))
884                         add_inode(c, w, k);
885         }
886         bch2_trans_iter_exit(trans, &iter);
887
888         if (ret)
889                 return ret;
890
891         w->first_this_inode = true;
892         w->have_inodes = true;
893         return 0;
894 }
895
896 static struct inode_walker_entry *
897 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
898 {
899         bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
900
901         struct inode_walker_entry *i;
902         __darray_for_each(w->inodes, i)
903                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
904                         goto found;
905
906         return NULL;
907 found:
908         BUG_ON(k.k->p.snapshot > i->snapshot);
909
910         if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
911                 struct inode_walker_entry new = *i;
912
913                 new.snapshot = k.k->p.snapshot;
914                 new.count = 0;
915
916                 struct printbuf buf = PRINTBUF;
917                 bch2_bkey_val_to_text(&buf, c, k);
918
919                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
920                          "unexpected because we should always update the inode when we update a key in that inode\n"
921                          "%s",
922                          w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
923                 printbuf_exit(&buf);
924
925                 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
926                         --i;
927
928                 size_t pos = i - w->inodes.data;
929                 int ret = darray_insert_item(&w->inodes, pos, new);
930                 if (ret)
931                         return ERR_PTR(ret);
932
933                 i = w->inodes.data + pos;
934         }
935
936         return i;
937 }
938
939 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
940                                              struct inode_walker *w,
941                                              struct bkey_s_c k)
942 {
943         if (w->last_pos.inode != k.k->p.inode) {
944                 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
945                 if (ret)
946                         return ERR_PTR(ret);
947         }
948
949         w->last_pos = k.k->p;
950
951         return lookup_inode_for_snapshot(trans->c, w, k);
952 }
953
954 static int get_visible_inodes(struct btree_trans *trans,
955                               struct inode_walker *w,
956                               struct snapshots_seen *s,
957                               u64 inum)
958 {
959         struct bch_fs *c = trans->c;
960         struct btree_iter iter;
961         struct bkey_s_c k;
962         int ret;
963
964         w->inodes.nr = 0;
965         w->deletes.nr = 0;
966
967         for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
968                            BTREE_ITER_all_snapshots, k, ret) {
969                 if (k.k->p.offset != inum)
970                         break;
971
972                 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
973                         continue;
974
975                 if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
976                         continue;
977
978                 ret = bkey_is_inode(k.k)
979                         ? add_inode(c, w, k)
980                         : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
981                 if (ret)
982                         break;
983         }
984         bch2_trans_iter_exit(trans, &iter);
985
986         return ret;
987 }
988
989 /*
990  * Prefer to delete the first one, since that will be the one at the wrong
991  * offset:
992  * return value: 0 -> delete k1, 1 -> delete k2
993  */
994 int bch2_fsck_update_backpointers(struct btree_trans *trans,
995                                   struct snapshots_seen *s,
996                                   const struct bch_hash_desc desc,
997                                   struct bch_hash_info *hash_info,
998                                   struct bkey_i *new)
999 {
1000         if (new->k.type != KEY_TYPE_dirent)
1001                 return 0;
1002
1003         struct bkey_i_dirent *d = bkey_i_to_dirent(new);
1004         struct inode_walker target = inode_walker_init();
1005         int ret = 0;
1006
1007         if (d->v.d_type == DT_SUBVOL) {
1008                 BUG();
1009         } else {
1010                 ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
1011                 if (ret)
1012                         goto err;
1013
1014                 darray_for_each(target.inodes, i) {
1015                         i->inode.bi_dir_offset = d->k.p.offset;
1016                         ret = __bch2_fsck_write_inode(trans, &i->inode);
1017                         if (ret)
1018                                 goto err;
1019                 }
1020         }
1021 err:
1022         inode_walker_exit(&target);
1023         return ret;
1024 }
1025
1026 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
1027                                                struct btree_iter *iter,
1028                                                struct bch_inode_unpacked *inode,
1029                                                u32 *snapshot)
1030 {
1031         if (inode->bi_subvol) {
1032                 u64 inum;
1033                 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
1034                 if (ret)
1035                         return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
1036         }
1037
1038         return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
1039 }
1040
1041 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1042 {
1043         struct btree_iter iter;
1044         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1045         int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1046         bch2_trans_iter_exit(trans, &iter);
1047         return ret;
1048 }
1049
1050 static int check_inode_dirent_inode(struct btree_trans *trans,
1051                                     struct bch_inode_unpacked *inode,
1052                                     bool *write_inode)
1053 {
1054         struct bch_fs *c = trans->c;
1055         struct printbuf buf = PRINTBUF;
1056
1057         u32 inode_snapshot = inode->bi_snapshot;
1058         struct btree_iter dirent_iter = {};
1059         struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
1060         int ret = bkey_err(d);
1061         if (ret && !bch2_err_matches(ret, ENOENT))
1062                 return ret;
1063
1064         if (fsck_err_on(ret,
1065                         trans, inode_points_to_missing_dirent,
1066                         "inode points to missing dirent\n%s",
1067                         (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
1068             fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
1069                         trans, inode_points_to_wrong_dirent,
1070                         "%s",
1071                         (printbuf_reset(&buf),
1072                          dirent_inode_mismatch_msg(&buf, c, d, inode),
1073                          buf.buf))) {
1074                 /*
1075                  * We just clear the backpointer fields for now. If we find a
1076                  * dirent that points to this inode in check_dirents(), we'll
1077                  * update it then; then when we get to check_path() if the
1078                  * backpointer is still 0 we'll reattach it.
1079                  */
1080                 inode->bi_dir = 0;
1081                 inode->bi_dir_offset = 0;
1082                 *write_inode = true;
1083         }
1084
1085         ret = 0;
1086 fsck_err:
1087         bch2_trans_iter_exit(trans, &dirent_iter);
1088         printbuf_exit(&buf);
1089         bch_err_fn(c, ret);
1090         return ret;
1091 }
1092
1093 static int get_snapshot_root_inode(struct btree_trans *trans,
1094                                    struct bch_inode_unpacked *root,
1095                                    u64 inum)
1096 {
1097         struct btree_iter iter;
1098         struct bkey_s_c k;
1099         int ret = 0;
1100
1101         for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
1102                                              SPOS(0, inum, U32_MAX),
1103                                              BTREE_ITER_all_snapshots, k, ret) {
1104                 if (k.k->p.offset != inum)
1105                         break;
1106                 if (bkey_is_inode(k.k))
1107                         goto found_root;
1108         }
1109         if (ret)
1110                 goto err;
1111         BUG();
1112 found_root:
1113         ret = bch2_inode_unpack(k, root);
1114 err:
1115         bch2_trans_iter_exit(trans, &iter);
1116         return ret;
1117 }
1118
1119 static int check_directory_size(struct btree_trans *trans,
1120                                 struct bch_inode_unpacked *inode_u,
1121                                 struct bkey_s_c inode_k, bool *write_inode)
1122 {
1123         struct btree_iter iter;
1124         struct bkey_s_c k;
1125         u64 new_size = 0;
1126         int ret;
1127
1128         for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
1129                         SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
1130                         POS(inode_k.k->p.offset, U64_MAX),
1131                         0, k, ret) {
1132                 if (k.k->type != KEY_TYPE_dirent)
1133                         continue;
1134
1135                 struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
1136                 struct qstr name = bch2_dirent_get_name(dirent);
1137
1138                 new_size += dirent_occupied_size(&name);
1139         }
1140         bch2_trans_iter_exit(trans, &iter);
1141
1142         if (!ret && inode_u->bi_size != new_size) {
1143                 inode_u->bi_size = new_size;
1144                 *write_inode = true;
1145         }
1146
1147         return ret;
1148 }
1149
1150 static int check_inode(struct btree_trans *trans,
1151                        struct btree_iter *iter,
1152                        struct bkey_s_c k,
1153                        struct bch_inode_unpacked *snapshot_root,
1154                        struct snapshots_seen *s)
1155 {
1156         struct bch_fs *c = trans->c;
1157         struct printbuf buf = PRINTBUF;
1158         struct bch_inode_unpacked u;
1159         bool do_update = false;
1160         int ret;
1161
1162         ret = bch2_check_key_has_snapshot(trans, iter, k);
1163         if (ret < 0)
1164                 goto err;
1165         if (ret)
1166                 return 0;
1167
1168         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1169         if (ret)
1170                 goto err;
1171
1172         if (!bkey_is_inode(k.k))
1173                 return 0;
1174
1175         ret = bch2_inode_unpack(k, &u);
1176         if (ret)
1177                 goto err;
1178
1179         if (snapshot_root->bi_inum != u.bi_inum) {
1180                 ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
1181                 if (ret)
1182                         goto err;
1183         }
1184
1185         if (fsck_err_on(u.bi_hash_seed          != snapshot_root->bi_hash_seed ||
1186                         INODE_STR_HASH(&u)      != INODE_STR_HASH(snapshot_root),
1187                         trans, inode_snapshot_mismatch,
1188                         "inode hash info in different snapshots don't match")) {
1189                 u.bi_hash_seed = snapshot_root->bi_hash_seed;
1190                 SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
1191                 do_update = true;
1192         }
1193
1194         if (u.bi_dir || u.bi_dir_offset) {
1195                 ret = check_inode_dirent_inode(trans, &u, &do_update);
1196                 if (ret)
1197                         goto err;
1198         }
1199
1200         if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
1201                         trans, inode_unlinked_but_has_dirent,
1202                         "inode unlinked but has dirent\n%s",
1203                         (printbuf_reset(&buf),
1204                          bch2_inode_unpacked_to_text(&buf, &u),
1205                          buf.buf))) {
1206                 u.bi_flags &= ~BCH_INODE_unlinked;
1207                 do_update = true;
1208         }
1209
1210         if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
1211                 /* Check for this early so that check_unreachable_inode() will reattach it */
1212
1213                 ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
1214                 if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
1215                         goto err;
1216
1217                 fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
1218                             "dir unlinked but not empty\n%s",
1219                             (printbuf_reset(&buf),
1220                              bch2_inode_unpacked_to_text(&buf, &u),
1221                              buf.buf));
1222                 u.bi_flags &= ~BCH_INODE_unlinked;
1223                 do_update = true;
1224                 ret = 0;
1225         }
1226
1227         ret = bch2_inode_has_child_snapshots(trans, k.k->p);
1228         if (ret < 0)
1229                 goto err;
1230
1231         if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
1232                         trans, inode_has_child_snapshots_wrong,
1233                         "inode has_child_snapshots flag wrong (should be %u)\n%s",
1234                         ret,
1235                         (printbuf_reset(&buf),
1236                          bch2_inode_unpacked_to_text(&buf, &u),
1237                          buf.buf))) {
1238                 if (ret)
1239                         u.bi_flags |= BCH_INODE_has_child_snapshot;
1240                 else
1241                         u.bi_flags &= ~BCH_INODE_has_child_snapshot;
1242                 do_update = true;
1243         }
1244         ret = 0;
1245
1246         if ((u.bi_flags & BCH_INODE_unlinked) &&
1247             !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
1248                 if (!test_bit(BCH_FS_started, &c->flags)) {
1249                         /*
1250                          * If we're not in online fsck, don't delete unlinked
1251                          * inodes, just make sure they're on the deleted list.
1252                          *
1253                          * They might be referred to by a logged operation -
1254                          * i.e. we might have crashed in the middle of a
1255                          * truncate on an unlinked but open file - so we want to
1256                          * let the delete_dead_inodes kill it after resuming
1257                          * logged ops.
1258                          */
1259                         ret = check_inode_deleted_list(trans, k.k->p);
1260                         if (ret < 0)
1261                                 goto err_noprint;
1262
1263                         fsck_err_on(!ret,
1264                                     trans, unlinked_inode_not_on_deleted_list,
1265                                     "inode %llu:%u unlinked, but not on deleted list",
1266                                     u.bi_inum, k.k->p.snapshot);
1267
1268                         ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
1269                         if (ret)
1270                                 goto err;
1271                 } else {
1272                         ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
1273                         if (ret < 0)
1274                                 goto err;
1275
1276                         if (fsck_err_on(!ret,
1277                                         trans, inode_unlinked_and_not_open,
1278                                       "inode %llu:%u unlinked and not open",
1279                                       u.bi_inum, u.bi_snapshot)) {
1280                                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1281                                 bch_err_msg(c, ret, "in fsck deleting inode");
1282                                 goto err_noprint;
1283                         }
1284                         ret = 0;
1285                 }
1286         }
1287
1288         if (fsck_err_on(u.bi_parent_subvol &&
1289                         (u.bi_subvol == 0 ||
1290                          u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1291                         trans, inode_bi_parent_nonzero,
1292                         "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1293                         u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1294                 u.bi_parent_subvol = 0;
1295                 do_update = true;
1296         }
1297
1298         if (u.bi_subvol) {
1299                 struct bch_subvolume s;
1300
1301                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
1302                 if (ret && !bch2_err_matches(ret, ENOENT))
1303                         goto err;
1304
1305                 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1306                         ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1307                         goto do_update;
1308                 }
1309
1310                 if (fsck_err_on(ret,
1311                                 trans, inode_bi_subvol_missing,
1312                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
1313                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1314                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1315                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1316                                                            k.k->p.snapshot),
1317                                 trans, inode_bi_subvol_wrong,
1318                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1319                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1320                                 le64_to_cpu(s.inode),
1321                                 le32_to_cpu(s.snapshot))) {
1322                         u.bi_subvol = 0;
1323                         u.bi_parent_subvol = 0;
1324                         do_update = true;
1325                 }
1326         }
1327
1328         if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
1329                         trans, inode_journal_seq_in_future,
1330                         "inode journal seq in future (currently at %llu)\n%s",
1331                         journal_cur_seq(&c->journal),
1332                         (printbuf_reset(&buf),
1333                          bch2_inode_unpacked_to_text(&buf, &u),
1334                         buf.buf))) {
1335                 u.bi_journal_seq = journal_cur_seq(&c->journal);
1336                 do_update = true;
1337         }
1338
1339         if (S_ISDIR(u.bi_mode)) {
1340                 ret = check_directory_size(trans, &u, k, &do_update);
1341
1342                 fsck_err_on(ret,
1343                             trans, directory_size_mismatch,
1344                             "directory inode %llu:%u with the mismatch directory size",
1345                             u.bi_inum, k.k->p.snapshot);
1346                 ret = 0;
1347         }
1348 do_update:
1349         if (do_update) {
1350                 ret = __bch2_fsck_write_inode(trans, &u);
1351                 bch_err_msg(c, ret, "in fsck updating inode");
1352                 if (ret)
1353                         goto err_noprint;
1354         }
1355 err:
1356 fsck_err:
1357         bch_err_fn(c, ret);
1358 err_noprint:
1359         printbuf_exit(&buf);
1360         return ret;
1361 }
1362
1363 int bch2_check_inodes(struct bch_fs *c)
1364 {
1365         struct bch_inode_unpacked snapshot_root = {};
1366         struct snapshots_seen s;
1367
1368         snapshots_seen_init(&s);
1369
1370         int ret = bch2_trans_run(c,
1371                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1372                                 POS_MIN,
1373                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1374                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1375                         check_inode(trans, &iter, k, &snapshot_root, &s)));
1376
1377         snapshots_seen_exit(&s);
1378         bch_err_fn(c, ret);
1379         return ret;
1380 }
1381
1382 static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
1383                                             struct bch_inode_unpacked *inode)
1384 {
1385         struct bch_fs *c = trans->c;
1386         struct btree_iter iter;
1387         struct bkey_s_c k;
1388         int ret = 0;
1389
1390         /*
1391          * We look for inodes to reattach in natural key order, leaves first,
1392          * but we should do the reattach at the oldest version that needs to be
1393          * reattached:
1394          */
1395         for_each_btree_key_norestart(trans, iter,
1396                                      BTREE_ID_inodes,
1397                                      SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
1398                                      BTREE_ITER_all_snapshots, k, ret) {
1399                 if (k.k->p.offset != inode->bi_inum)
1400                         break;
1401
1402                 if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
1403                         continue;
1404
1405                 if (!bkey_is_inode(k.k))
1406                         break;
1407
1408                 struct bch_inode_unpacked parent_inode;
1409                 ret = bch2_inode_unpack(k, &parent_inode);
1410                 if (ret)
1411                         break;
1412
1413                 if (!inode_should_reattach(&parent_inode))
1414                         break;
1415
1416                 *inode = parent_inode;
1417         }
1418         bch2_trans_iter_exit(trans, &iter);
1419
1420         return ret;
1421 }
1422
1423 static int check_unreachable_inode(struct btree_trans *trans,
1424                                    struct btree_iter *iter,
1425                                    struct bkey_s_c k)
1426 {
1427         struct printbuf buf = PRINTBUF;
1428         int ret = 0;
1429
1430         if (!bkey_is_inode(k.k))
1431                 return 0;
1432
1433         struct bch_inode_unpacked inode;
1434         ret = bch2_inode_unpack(k, &inode);
1435         if (ret)
1436                 return ret;
1437
1438         if (!inode_should_reattach(&inode))
1439                 return 0;
1440
1441         ret = find_oldest_inode_needs_reattach(trans, &inode);
1442         if (ret)
1443                 return ret;
1444
1445         if (fsck_err(trans, inode_unreachable,
1446                      "unreachable inode:\n%s",
1447                      (bch2_inode_unpacked_to_text(&buf, &inode),
1448                       buf.buf)))
1449                 ret = reattach_inode(trans, &inode);
1450 fsck_err:
1451         printbuf_exit(&buf);
1452         return ret;
1453 }
1454
1455 /*
1456  * Reattach unreachable (but not unlinked) inodes
1457  *
1458  * Run after check_inodes() and check_dirents(), so we node that inode
1459  * backpointer fields point to valid dirents, and every inode that has a dirent
1460  * that points to it has its backpointer field set - so we're just looking for
1461  * non-unlinked inodes without backpointers:
1462  *
1463  * XXX: this is racy w.r.t. hardlink removal in online fsck
1464  */
1465 int bch2_check_unreachable_inodes(struct bch_fs *c)
1466 {
1467         int ret = bch2_trans_run(c,
1468                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1469                                 POS_MIN,
1470                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1471                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1472                         check_unreachable_inode(trans, &iter, k)));
1473         bch_err_fn(c, ret);
1474         return ret;
1475 }
1476
1477 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1478 {
1479         switch (btree) {
1480         case BTREE_ID_extents:
1481                 return S_ISREG(mode) || S_ISLNK(mode);
1482         case BTREE_ID_dirents:
1483                 return S_ISDIR(mode);
1484         case BTREE_ID_xattrs:
1485                 return true;
1486         default:
1487                 BUG();
1488         }
1489 }
1490
1491 static int check_key_has_inode(struct btree_trans *trans,
1492                                struct btree_iter *iter,
1493                                struct inode_walker *inode,
1494                                struct inode_walker_entry *i,
1495                                struct bkey_s_c k)
1496 {
1497         struct bch_fs *c = trans->c;
1498         struct printbuf buf = PRINTBUF;
1499         int ret = PTR_ERR_OR_ZERO(i);
1500         if (ret)
1501                 return ret;
1502
1503         if (k.k->type == KEY_TYPE_whiteout)
1504                 goto out;
1505
1506         if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1507                 ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
1508                         bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1509                 if (ret)
1510                         goto err;
1511
1512                 inode->last_pos.inode--;
1513                 ret = -BCH_ERR_transaction_restart_nested;
1514                 goto err;
1515         }
1516
1517         if (fsck_err_on(!i,
1518                         trans, key_in_missing_inode,
1519                         "key in missing inode:\n  %s",
1520                         (printbuf_reset(&buf),
1521                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1522                 goto delete;
1523
1524         if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
1525                         trans, key_in_wrong_inode_type,
1526                         "key for wrong inode mode %o:\n  %s",
1527                         i->inode.bi_mode,
1528                         (printbuf_reset(&buf),
1529                          bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1530                 goto delete;
1531 out:
1532 err:
1533 fsck_err:
1534         printbuf_exit(&buf);
1535         bch_err_fn(c, ret);
1536         return ret;
1537 delete:
1538         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
1539         goto out;
1540 }
1541
1542 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1543 {
1544         struct bch_fs *c = trans->c;
1545         int ret = 0;
1546         s64 count2;
1547
1548         darray_for_each(w->inodes, i) {
1549                 if (i->inode.bi_sectors == i->count)
1550                         continue;
1551
1552                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1553
1554                 if (w->recalculate_sums)
1555                         i->count = count2;
1556
1557                 if (i->count != count2) {
1558                         bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1559                                             w->last_pos.inode, i->snapshot, i->count, count2);
1560                         i->count = count2;
1561                 }
1562
1563                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1564                                 trans, inode_i_sectors_wrong,
1565                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1566                                 w->last_pos.inode, i->snapshot,
1567                                 i->inode.bi_sectors, i->count)) {
1568                         i->inode.bi_sectors = i->count;
1569                         ret = bch2_fsck_write_inode(trans, &i->inode);
1570                         if (ret)
1571                                 break;
1572                 }
1573         }
1574 fsck_err:
1575         bch_err_fn(c, ret);
1576         return ret;
1577 }
1578
1579 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1580 {
1581         u32 restart_count = trans->restart_count;
1582         return check_i_sectors_notnested(trans, w) ?:
1583                 trans_was_restarted(trans, restart_count);
1584 }
1585
1586 struct extent_end {
1587         u32                     snapshot;
1588         u64                     offset;
1589         struct snapshots_seen   seen;
1590 };
1591
1592 struct extent_ends {
1593         struct bpos                     last_pos;
1594         DARRAY(struct extent_end)       e;
1595 };
1596
1597 static void extent_ends_reset(struct extent_ends *extent_ends)
1598 {
1599         darray_for_each(extent_ends->e, i)
1600                 snapshots_seen_exit(&i->seen);
1601         extent_ends->e.nr = 0;
1602 }
1603
1604 static void extent_ends_exit(struct extent_ends *extent_ends)
1605 {
1606         extent_ends_reset(extent_ends);
1607         darray_exit(&extent_ends->e);
1608 }
1609
1610 static void extent_ends_init(struct extent_ends *extent_ends)
1611 {
1612         memset(extent_ends, 0, sizeof(*extent_ends));
1613 }
1614
1615 static int extent_ends_at(struct bch_fs *c,
1616                           struct extent_ends *extent_ends,
1617                           struct snapshots_seen *seen,
1618                           struct bkey_s_c k)
1619 {
1620         struct extent_end *i, n = (struct extent_end) {
1621                 .offset         = k.k->p.offset,
1622                 .snapshot       = k.k->p.snapshot,
1623                 .seen           = *seen,
1624         };
1625
1626         n.seen.ids.data = kmemdup(seen->ids.data,
1627                               sizeof(seen->ids.data[0]) * seen->ids.size,
1628                               GFP_KERNEL);
1629         if (!n.seen.ids.data)
1630                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1631
1632         __darray_for_each(extent_ends->e, i) {
1633                 if (i->snapshot == k.k->p.snapshot) {
1634                         snapshots_seen_exit(&i->seen);
1635                         *i = n;
1636                         return 0;
1637                 }
1638
1639                 if (i->snapshot >= k.k->p.snapshot)
1640                         break;
1641         }
1642
1643         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1644 }
1645
1646 static int overlapping_extents_found(struct btree_trans *trans,
1647                                      enum btree_id btree,
1648                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1649                                      struct bkey pos2,
1650                                      bool *fixed,
1651                                      struct extent_end *extent_end)
1652 {
1653         struct bch_fs *c = trans->c;
1654         struct printbuf buf = PRINTBUF;
1655         struct btree_iter iter1, iter2 = { NULL };
1656         struct bkey_s_c k1, k2;
1657         int ret;
1658
1659         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1660
1661         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1662                              BTREE_ITER_all_snapshots|
1663                              BTREE_ITER_not_extents);
1664         k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
1665         ret = bkey_err(k1);
1666         if (ret)
1667                 goto err;
1668
1669         prt_str(&buf, "\n  ");
1670         bch2_bkey_val_to_text(&buf, c, k1);
1671
1672         if (!bpos_eq(pos1, k1.k->p)) {
1673                 prt_str(&buf, "\n  wanted\n  ");
1674                 bch2_bpos_to_text(&buf, pos1);
1675                 prt_str(&buf, "\n  ");
1676                 bch2_bkey_to_text(&buf, &pos2);
1677
1678                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1679                         __func__, buf.buf);
1680                 ret = -BCH_ERR_internal_fsck_err;
1681                 goto err;
1682         }
1683
1684         bch2_trans_copy_iter(&iter2, &iter1);
1685
1686         while (1) {
1687                 bch2_btree_iter_advance(&iter2);
1688
1689                 k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
1690                 ret = bkey_err(k2);
1691                 if (ret)
1692                         goto err;
1693
1694                 if (bpos_ge(k2.k->p, pos2.p))
1695                         break;
1696         }
1697
1698         prt_str(&buf, "\n  ");
1699         bch2_bkey_val_to_text(&buf, c, k2);
1700
1701         if (bpos_gt(k2.k->p, pos2.p) ||
1702             pos2.size != k2.k->size) {
1703                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1704                         __func__, buf.buf);
1705                 ret = -BCH_ERR_internal_fsck_err;
1706                 goto err;
1707         }
1708
1709         prt_printf(&buf, "\n  overwriting %s extent",
1710                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1711
1712         if (fsck_err(trans, extent_overlapping,
1713                      "overlapping extents%s", buf.buf)) {
1714                 struct btree_iter *old_iter = &iter1;
1715                 struct disk_reservation res = { 0 };
1716
1717                 if (pos1.snapshot < pos2.p.snapshot) {
1718                         old_iter = &iter2;
1719                         swap(k1, k2);
1720                 }
1721
1722                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1723
1724                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1725                                 BTREE_UPDATE_internal_snapshot_node,
1726                                 k1, k2) ?:
1727                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1728                 bch2_disk_reservation_put(c, &res);
1729
1730                 if (ret)
1731                         goto err;
1732
1733                 *fixed = true;
1734
1735                 if (pos1.snapshot == pos2.p.snapshot) {
1736                         /*
1737                          * We overwrote the first extent, and did the overwrite
1738                          * in the same snapshot:
1739                          */
1740                         extent_end->offset = bkey_start_offset(&pos2);
1741                 } else if (pos1.snapshot > pos2.p.snapshot) {
1742                         /*
1743                          * We overwrote the first extent in pos2's snapshot:
1744                          */
1745                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1746                 } else {
1747                         /*
1748                          * We overwrote the second extent - restart
1749                          * check_extent() from the top:
1750                          */
1751                         ret = -BCH_ERR_transaction_restart_nested;
1752                 }
1753         }
1754 fsck_err:
1755 err:
1756         bch2_trans_iter_exit(trans, &iter2);
1757         bch2_trans_iter_exit(trans, &iter1);
1758         printbuf_exit(&buf);
1759         return ret;
1760 }
1761
1762 static int check_overlapping_extents(struct btree_trans *trans,
1763                               struct snapshots_seen *seen,
1764                               struct extent_ends *extent_ends,
1765                               struct bkey_s_c k,
1766                               struct btree_iter *iter,
1767                               bool *fixed)
1768 {
1769         struct bch_fs *c = trans->c;
1770         int ret = 0;
1771
1772         /* transaction restart, running again */
1773         if (bpos_eq(extent_ends->last_pos, k.k->p))
1774                 return 0;
1775
1776         if (extent_ends->last_pos.inode != k.k->p.inode)
1777                 extent_ends_reset(extent_ends);
1778
1779         darray_for_each(extent_ends->e, i) {
1780                 if (i->offset <= bkey_start_offset(k.k))
1781                         continue;
1782
1783                 if (!ref_visible2(c,
1784                                   k.k->p.snapshot, seen,
1785                                   i->snapshot, &i->seen))
1786                         continue;
1787
1788                 ret = overlapping_extents_found(trans, iter->btree_id,
1789                                                 SPOS(iter->pos.inode,
1790                                                      i->offset,
1791                                                      i->snapshot),
1792                                                 &i->seen,
1793                                                 *k.k, fixed, i);
1794                 if (ret)
1795                         goto err;
1796         }
1797
1798         extent_ends->last_pos = k.k->p;
1799 err:
1800         return ret;
1801 }
1802
1803 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1804                                 struct bkey_s_c k)
1805 {
1806         struct bch_fs *c = trans->c;
1807         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1808         struct bch_extent_crc_unpacked crc;
1809         const union bch_extent_entry *i;
1810         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1811
1812         bkey_for_each_crc(k.k, ptrs, crc, i)
1813                 if (crc_is_encoded(crc) &&
1814                     crc.uncompressed_size > encoded_extent_max_sectors) {
1815                         struct printbuf buf = PRINTBUF;
1816
1817                         bch2_bkey_val_to_text(&buf, c, k);
1818                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1819                         printbuf_exit(&buf);
1820                 }
1821
1822         return 0;
1823 }
1824
1825 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1826                         struct bkey_s_c k,
1827                         struct inode_walker *inode,
1828                         struct snapshots_seen *s,
1829                         struct extent_ends *extent_ends,
1830                         struct disk_reservation *res)
1831 {
1832         struct bch_fs *c = trans->c;
1833         struct printbuf buf = PRINTBUF;
1834         int ret = 0;
1835
1836         ret = bch2_check_key_has_snapshot(trans, iter, k);
1837         if (ret) {
1838                 ret = ret < 0 ? ret : 0;
1839                 goto out;
1840         }
1841
1842         if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
1843                 ret = check_i_sectors(trans, inode);
1844                 if (ret)
1845                         goto err;
1846         }
1847
1848         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1849         if (ret)
1850                 goto err;
1851
1852         struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
1853         ret = PTR_ERR_OR_ZERO(extent_i);
1854         if (ret)
1855                 goto err;
1856
1857         ret = check_key_has_inode(trans, iter, inode, extent_i, k);
1858         if (ret)
1859                 goto err;
1860
1861         if (k.k->type != KEY_TYPE_whiteout) {
1862                 ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
1863                                                 &inode->recalculate_sums);
1864                 if (ret)
1865                         goto err;
1866
1867                 /*
1868                  * Check inodes in reverse order, from oldest snapshots to
1869                  * newest, starting from the inode that matches this extent's
1870                  * snapshot. If we didn't have one, iterate over all inodes:
1871                  */
1872                 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
1873                      inode->inodes.data && i >= inode->inodes.data;
1874                      --i) {
1875                         if (i->snapshot > k.k->p.snapshot ||
1876                             !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1877                                 continue;
1878
1879                         if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1880                                         !bkey_extent_is_reservation(k),
1881                                         trans, extent_past_end_of_inode,
1882                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1883                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1884                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1885                                 struct btree_iter iter2;
1886
1887                                 bch2_trans_copy_iter(&iter2, iter);
1888                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1889                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1890                                         bch2_btree_delete_at(trans, &iter2,
1891                                                 BTREE_UPDATE_internal_snapshot_node);
1892                                 bch2_trans_iter_exit(trans, &iter2);
1893                                 if (ret)
1894                                         goto err;
1895
1896                                 iter->k.type = KEY_TYPE_whiteout;
1897                                 break;
1898                         }
1899                 }
1900         }
1901
1902         ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
1903         if (ret)
1904                 goto err;
1905
1906         if (bkey_extent_is_allocation(k.k)) {
1907                 for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
1908                      inode->inodes.data && i >= inode->inodes.data;
1909                      --i) {
1910                         if (i->snapshot > k.k->p.snapshot ||
1911                             !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
1912                                 continue;
1913
1914                         i->count += k.k->size;
1915                 }
1916         }
1917
1918         if (k.k->type != KEY_TYPE_whiteout) {
1919                 ret = extent_ends_at(c, extent_ends, s, k);
1920                 if (ret)
1921                         goto err;
1922         }
1923 out:
1924 err:
1925 fsck_err:
1926         printbuf_exit(&buf);
1927         bch_err_fn(c, ret);
1928         return ret;
1929 }
1930
1931 /*
1932  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1933  * that i_size an i_sectors are consistent
1934  */
1935 int bch2_check_extents(struct bch_fs *c)
1936 {
1937         struct inode_walker w = inode_walker_init();
1938         struct snapshots_seen s;
1939         struct extent_ends extent_ends;
1940         struct disk_reservation res = { 0 };
1941
1942         snapshots_seen_init(&s);
1943         extent_ends_init(&extent_ends);
1944
1945         int ret = bch2_trans_run(c,
1946                 for_each_btree_key(trans, iter, BTREE_ID_extents,
1947                                 POS(BCACHEFS_ROOT_INO, 0),
1948                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
1949                         bch2_disk_reservation_put(c, &res);
1950                         check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
1951                         check_extent_overbig(trans, &iter, k);
1952                 })) ?:
1953                 check_i_sectors_notnested(trans, &w));
1954
1955         bch2_disk_reservation_put(c, &res);
1956         extent_ends_exit(&extent_ends);
1957         inode_walker_exit(&w);
1958         snapshots_seen_exit(&s);
1959
1960         bch_err_fn(c, ret);
1961         return ret;
1962 }
1963
1964 int bch2_check_indirect_extents(struct bch_fs *c)
1965 {
1966         struct disk_reservation res = { 0 };
1967
1968         int ret = bch2_trans_run(c,
1969                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1970                                 POS_MIN,
1971                                 BTREE_ITER_prefetch, k,
1972                                 &res, NULL,
1973                                 BCH_TRANS_COMMIT_no_enospc, ({
1974                         bch2_disk_reservation_put(c, &res);
1975                         check_extent_overbig(trans, &iter, k);
1976                 })));
1977
1978         bch2_disk_reservation_put(c, &res);
1979         bch_err_fn(c, ret);
1980         return ret;
1981 }
1982
1983 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1984 {
1985         struct bch_fs *c = trans->c;
1986         int ret = 0;
1987         s64 count2;
1988
1989         darray_for_each(w->inodes, i) {
1990                 if (i->inode.bi_nlink == i->count)
1991                         continue;
1992
1993                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1994                 if (count2 < 0)
1995                         return count2;
1996
1997                 if (i->count != count2) {
1998                         bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1999                                             w->last_pos.inode, i->snapshot, i->count, count2);
2000                         i->count = count2;
2001                         if (i->inode.bi_nlink == i->count)
2002                                 continue;
2003                 }
2004
2005                 if (fsck_err_on(i->inode.bi_nlink != i->count,
2006                                 trans, inode_dir_wrong_nlink,
2007                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
2008                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
2009                         i->inode.bi_nlink = i->count;
2010                         ret = bch2_fsck_write_inode(trans, &i->inode);
2011                         if (ret)
2012                                 break;
2013                 }
2014         }
2015 fsck_err:
2016         bch_err_fn(c, ret);
2017         return ret;
2018 }
2019
2020 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
2021 {
2022         u32 restart_count = trans->restart_count;
2023         return check_subdir_count_notnested(trans, w) ?:
2024                 trans_was_restarted(trans, restart_count);
2025 }
2026
2027 noinline_for_stack
2028 static int check_dirent_inode_dirent(struct btree_trans *trans,
2029                                    struct btree_iter *iter,
2030                                    struct bkey_s_c_dirent d,
2031                                    struct bch_inode_unpacked *target)
2032 {
2033         struct bch_fs *c = trans->c;
2034         struct printbuf buf = PRINTBUF;
2035         struct btree_iter bp_iter = { NULL };
2036         int ret = 0;
2037
2038         if (inode_points_to_dirent(target, d))
2039                 return 0;
2040
2041         if (!target->bi_dir &&
2042             !target->bi_dir_offset) {
2043                 fsck_err_on(S_ISDIR(target->bi_mode),
2044                             trans, inode_dir_missing_backpointer,
2045                             "directory with missing backpointer\n%s",
2046                             (printbuf_reset(&buf),
2047                              bch2_bkey_val_to_text(&buf, c, d.s_c),
2048                              prt_printf(&buf, "\n"),
2049                              bch2_inode_unpacked_to_text(&buf, target),
2050                              buf.buf));
2051
2052                 fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
2053                             trans, inode_unlinked_but_has_dirent,
2054                             "inode unlinked but has dirent\n%s",
2055                             (printbuf_reset(&buf),
2056                              bch2_bkey_val_to_text(&buf, c, d.s_c),
2057                              prt_printf(&buf, "\n"),
2058                              bch2_inode_unpacked_to_text(&buf, target),
2059                              buf.buf));
2060
2061                 target->bi_flags &= ~BCH_INODE_unlinked;
2062                 target->bi_dir          = d.k->p.inode;
2063                 target->bi_dir_offset   = d.k->p.offset;
2064                 return __bch2_fsck_write_inode(trans, target);
2065         }
2066
2067         if (bch2_inode_should_have_single_bp(target) &&
2068             !fsck_err(trans, inode_wrong_backpointer,
2069                       "dirent points to inode that does not point back:\n  %s",
2070                       (bch2_bkey_val_to_text(&buf, c, d.s_c),
2071                        prt_printf(&buf, "\n  "),
2072                        bch2_inode_unpacked_to_text(&buf, target),
2073                        buf.buf)))
2074                 goto err;
2075
2076         struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
2077                               SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
2078         ret = bkey_err(bp_dirent);
2079         if (ret && !bch2_err_matches(ret, ENOENT))
2080                 goto err;
2081
2082         bool backpointer_exists = !ret;
2083         ret = 0;
2084
2085         if (fsck_err_on(!backpointer_exists,
2086                         trans, inode_wrong_backpointer,
2087                         "inode %llu:%u has wrong backpointer:\n"
2088                         "got       %llu:%llu\n"
2089                         "should be %llu:%llu",
2090                         target->bi_inum, target->bi_snapshot,
2091                         target->bi_dir,
2092                         target->bi_dir_offset,
2093                         d.k->p.inode,
2094                         d.k->p.offset)) {
2095                 target->bi_dir          = d.k->p.inode;
2096                 target->bi_dir_offset   = d.k->p.offset;
2097                 ret = __bch2_fsck_write_inode(trans, target);
2098                 goto out;
2099         }
2100
2101         bch2_bkey_val_to_text(&buf, c, d.s_c);
2102         prt_newline(&buf);
2103         if (backpointer_exists)
2104                 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
2105
2106         if (fsck_err_on(backpointer_exists &&
2107                         (S_ISDIR(target->bi_mode) ||
2108                          target->bi_subvol),
2109                         trans, inode_dir_multiple_links,
2110                         "%s %llu:%u with multiple links\n%s",
2111                         S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
2112                         target->bi_inum, target->bi_snapshot, buf.buf)) {
2113                 ret = __remove_dirent(trans, d.k->p);
2114                 goto out;
2115         }
2116
2117         /*
2118          * hardlinked file with nlink 0:
2119          * We're just adjusting nlink here so check_nlinks() will pick
2120          * it up, it ignores inodes with nlink 0
2121          */
2122         if (fsck_err_on(backpointer_exists && !target->bi_nlink,
2123                         trans, inode_multiple_links_but_nlink_0,
2124                         "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
2125                         target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
2126                 target->bi_nlink++;
2127                 target->bi_flags &= ~BCH_INODE_unlinked;
2128                 ret = __bch2_fsck_write_inode(trans, target);
2129                 if (ret)
2130                         goto err;
2131         }
2132 out:
2133 err:
2134 fsck_err:
2135         bch2_trans_iter_exit(trans, &bp_iter);
2136         printbuf_exit(&buf);
2137         bch_err_fn(c, ret);
2138         return ret;
2139 }
2140
2141 noinline_for_stack
2142 static int check_dirent_target(struct btree_trans *trans,
2143                                struct btree_iter *iter,
2144                                struct bkey_s_c_dirent d,
2145                                struct bch_inode_unpacked *target)
2146 {
2147         struct bch_fs *c = trans->c;
2148         struct bkey_i_dirent *n;
2149         struct printbuf buf = PRINTBUF;
2150         int ret = 0;
2151
2152         ret = check_dirent_inode_dirent(trans, iter, d, target);
2153         if (ret)
2154                 goto err;
2155
2156         if (fsck_err_on(d.v->d_type != inode_d_type(target),
2157                         trans, dirent_d_type_wrong,
2158                         "incorrect d_type: got %s, should be %s:\n%s",
2159                         bch2_d_type_str(d.v->d_type),
2160                         bch2_d_type_str(inode_d_type(target)),
2161                         (printbuf_reset(&buf),
2162                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2163                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
2164                 ret = PTR_ERR_OR_ZERO(n);
2165                 if (ret)
2166                         goto err;
2167
2168                 bkey_reassemble(&n->k_i, d.s_c);
2169                 n->v.d_type = inode_d_type(target);
2170                 if (n->v.d_type == DT_SUBVOL) {
2171                         n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
2172                         n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
2173                 } else {
2174                         n->v.d_inum = cpu_to_le64(target->bi_inum);
2175                 }
2176
2177                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
2178                 if (ret)
2179                         goto err;
2180
2181                 d = dirent_i_to_s_c(n);
2182         }
2183 err:
2184 fsck_err:
2185         printbuf_exit(&buf);
2186         bch_err_fn(c, ret);
2187         return ret;
2188 }
2189
2190 /* find a subvolume that's a descendent of @snapshot: */
2191 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
2192 {
2193         struct btree_iter iter;
2194         struct bkey_s_c k;
2195         int ret;
2196
2197         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
2198                 if (k.k->type != KEY_TYPE_subvolume)
2199                         continue;
2200
2201                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2202                 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
2203                         bch2_trans_iter_exit(trans, &iter);
2204                         *subvolid = k.k->p.offset;
2205                         goto found;
2206                 }
2207         }
2208         if (!ret)
2209                 ret = -ENOENT;
2210 found:
2211         bch2_trans_iter_exit(trans, &iter);
2212         return ret;
2213 }
2214
2215 noinline_for_stack
2216 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2217                                   struct bkey_s_c_dirent d)
2218 {
2219         struct bch_fs *c = trans->c;
2220         struct btree_iter subvol_iter = {};
2221         struct bch_inode_unpacked subvol_root;
2222         u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2223         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2224         u32 parent_snapshot;
2225         u32 new_parent_subvol = 0;
2226         u64 parent_inum;
2227         struct printbuf buf = PRINTBUF;
2228         int ret = 0;
2229
2230         ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2231         if (ret && !bch2_err_matches(ret, ENOENT))
2232                 return ret;
2233
2234         if (ret ||
2235             (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2236                 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2237                 if (ret2 && !bch2_err_matches(ret, ENOENT))
2238                         return ret2;
2239         }
2240
2241         if (ret &&
2242             !new_parent_subvol &&
2243             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2244                 /*
2245                  * Couldn't find a subvol for dirent's snapshot - but we lost
2246                  * subvols, so we need to reconstruct:
2247                  */
2248                 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2249                 if (ret)
2250                         return ret;
2251
2252                 parent_snapshot = d.k->p.snapshot;
2253         }
2254
2255         if (fsck_err_on(ret,
2256                         trans, dirent_to_missing_parent_subvol,
2257                         "dirent parent_subvol points to missing subvolume\n%s",
2258                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2259             fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2260                         trans, dirent_not_visible_in_parent_subvol,
2261                         "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2262                         parent_snapshot,
2263                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2264                 if (!new_parent_subvol) {
2265                         bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2266                         return -BCH_ERR_fsck_repair_unimplemented;
2267                 }
2268
2269                 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2270                 ret = PTR_ERR_OR_ZERO(new_dirent);
2271                 if (ret)
2272                         goto err;
2273
2274                 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2275         }
2276
2277         struct bkey_s_c_subvolume s =
2278                 bch2_bkey_get_iter_typed(trans, &subvol_iter,
2279                                          BTREE_ID_subvolumes, POS(0, target_subvol),
2280                                          0, subvolume);
2281         ret = bkey_err(s.s_c);
2282         if (ret && !bch2_err_matches(ret, ENOENT))
2283                 return ret;
2284
2285         if (ret) {
2286                 if (fsck_err(trans, dirent_to_missing_subvol,
2287                              "dirent points to missing subvolume\n%s",
2288                              (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2289                         return __remove_dirent(trans, d.k->p);
2290                 ret = 0;
2291                 goto out;
2292         }
2293
2294         if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2295                         trans, subvol_fs_path_parent_wrong,
2296                         "subvol with wrong fs_path_parent, should be be %u\n%s",
2297                         parent_subvol,
2298                         (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2299                 struct bkey_i_subvolume *n =
2300                         bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2301                 ret = PTR_ERR_OR_ZERO(n);
2302                 if (ret)
2303                         goto err;
2304
2305                 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2306         }
2307
2308         u64 target_inum = le64_to_cpu(s.v->inode);
2309         u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2310
2311         ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2312         if (ret && !bch2_err_matches(ret, ENOENT))
2313                 goto err;
2314
2315         if (ret) {
2316                 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2317                 ret = -BCH_ERR_fsck_repair_unimplemented;
2318                 goto err;
2319         }
2320
2321         if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2322                         trans, inode_bi_parent_wrong,
2323                         "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2324                         target_inum,
2325                         subvol_root.bi_parent_subvol, parent_subvol)) {
2326                 subvol_root.bi_parent_subvol = parent_subvol;
2327                 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2328                 ret = __bch2_fsck_write_inode(trans, &subvol_root);
2329                 if (ret)
2330                         goto err;
2331         }
2332
2333         ret = check_dirent_target(trans, iter, d, &subvol_root);
2334         if (ret)
2335                 goto err;
2336 out:
2337 err:
2338 fsck_err:
2339         bch2_trans_iter_exit(trans, &subvol_iter);
2340         printbuf_exit(&buf);
2341         return ret;
2342 }
2343
2344 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2345                         struct bkey_s_c k,
2346                         struct bch_hash_info *hash_info,
2347                         struct inode_walker *dir,
2348                         struct inode_walker *target,
2349                         struct snapshots_seen *s)
2350 {
2351         struct bch_fs *c = trans->c;
2352         struct inode_walker_entry *i;
2353         struct printbuf buf = PRINTBUF;
2354         int ret = 0;
2355
2356         ret = bch2_check_key_has_snapshot(trans, iter, k);
2357         if (ret) {
2358                 ret = ret < 0 ? ret : 0;
2359                 goto out;
2360         }
2361
2362         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2363         if (ret)
2364                 goto err;
2365
2366         if (k.k->type == KEY_TYPE_whiteout)
2367                 goto out;
2368
2369         if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
2370                 ret = check_subdir_count(trans, dir);
2371                 if (ret)
2372                         goto err;
2373         }
2374
2375         i = walk_inode(trans, dir, k);
2376         ret = PTR_ERR_OR_ZERO(i);
2377         if (ret < 0)
2378                 goto err;
2379
2380         ret = check_key_has_inode(trans, iter, dir, i, k);
2381         if (ret)
2382                 goto err;
2383
2384         if (!i)
2385                 goto out;
2386
2387         if (dir->first_this_inode)
2388                 *hash_info = bch2_hash_info_init(c, &i->inode);
2389         dir->first_this_inode = false;
2390
2391         ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
2392         if (ret < 0)
2393                 goto err;
2394         if (ret) {
2395                 /* dirent has been deleted */
2396                 ret = 0;
2397                 goto out;
2398         }
2399
2400         if (k.k->type != KEY_TYPE_dirent)
2401                 goto out;
2402
2403         struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2404
2405         if (d.v->d_type == DT_SUBVOL) {
2406                 ret = check_dirent_to_subvol(trans, iter, d);
2407                 if (ret)
2408                         goto err;
2409         } else {
2410                 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2411                 if (ret)
2412                         goto err;
2413
2414                 if (fsck_err_on(!target->inodes.nr,
2415                                 trans, dirent_to_missing_inode,
2416                                 "dirent points to missing inode:\n%s",
2417                                 (printbuf_reset(&buf),
2418                                  bch2_bkey_val_to_text(&buf, c, k),
2419                                  buf.buf))) {
2420                         ret = __remove_dirent(trans, d.k->p);
2421                         if (ret)
2422                                 goto err;
2423                 }
2424
2425                 darray_for_each(target->inodes, i) {
2426                         ret = check_dirent_target(trans, iter, d, &i->inode);
2427                         if (ret)
2428                                 goto err;
2429                 }
2430
2431                 darray_for_each(target->deletes, i)
2432                         if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
2433                                         trans, dirent_to_overwritten_inode,
2434                                         "dirent points to inode overwritten in snapshot %u:\n%s",
2435                                         *i,
2436                                         (printbuf_reset(&buf),
2437                                          bch2_bkey_val_to_text(&buf, c, k),
2438                                          buf.buf))) {
2439                                 struct btree_iter delete_iter;
2440                                 bch2_trans_iter_init(trans, &delete_iter,
2441                                                      BTREE_ID_dirents,
2442                                                      SPOS(k.k->p.inode, k.k->p.offset, *i),
2443                                                      BTREE_ITER_intent);
2444                                 ret =   bch2_btree_iter_traverse(&delete_iter) ?:
2445                                         bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
2446                                                           hash_info,
2447                                                           &delete_iter,
2448                                                           BTREE_UPDATE_internal_snapshot_node);
2449                                 bch2_trans_iter_exit(trans, &delete_iter);
2450                                 if (ret)
2451                                         goto err;
2452
2453                         }
2454         }
2455
2456         ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2457         if (ret)
2458                 goto err;
2459
2460         if (d.v->d_type == DT_DIR)
2461                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
2462                         i->count++;
2463 out:
2464 err:
2465 fsck_err:
2466         printbuf_exit(&buf);
2467         bch_err_fn(c, ret);
2468         return ret;
2469 }
2470
2471 /*
2472  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2473  * validate d_type
2474  */
2475 int bch2_check_dirents(struct bch_fs *c)
2476 {
2477         struct inode_walker dir = inode_walker_init();
2478         struct inode_walker target = inode_walker_init();
2479         struct snapshots_seen s;
2480         struct bch_hash_info hash_info;
2481
2482         snapshots_seen_init(&s);
2483
2484         int ret = bch2_trans_run(c,
2485                 for_each_btree_key(trans, iter, BTREE_ID_dirents,
2486                                 POS(BCACHEFS_ROOT_INO, 0),
2487                                 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
2488                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2489                 check_subdir_count_notnested(trans, &dir));
2490
2491         snapshots_seen_exit(&s);
2492         inode_walker_exit(&dir);
2493         inode_walker_exit(&target);
2494         bch_err_fn(c, ret);
2495         return ret;
2496 }
2497
2498 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2499                        struct bkey_s_c k,
2500                        struct bch_hash_info *hash_info,
2501                        struct inode_walker *inode)
2502 {
2503         struct bch_fs *c = trans->c;
2504         struct inode_walker_entry *i;
2505         int ret;
2506
2507         ret = bch2_check_key_has_snapshot(trans, iter, k);
2508         if (ret < 0)
2509                 return ret;
2510         if (ret)
2511                 return 0;
2512
2513         i = walk_inode(trans, inode, k);
2514         ret = PTR_ERR_OR_ZERO(i);
2515         if (ret)
2516                 return ret;
2517
2518         ret = check_key_has_inode(trans, iter, inode, i, k);
2519         if (ret)
2520                 return ret;
2521
2522         if (!i)
2523                 return 0;
2524
2525         if (inode->first_this_inode)
2526                 *hash_info = bch2_hash_info_init(c, &i->inode);
2527         inode->first_this_inode = false;
2528
2529         ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
2530         bch_err_fn(c, ret);
2531         return ret;
2532 }
2533
2534 /*
2535  * Walk xattrs: verify that they all have a corresponding inode
2536  */
2537 int bch2_check_xattrs(struct bch_fs *c)
2538 {
2539         struct inode_walker inode = inode_walker_init();
2540         struct bch_hash_info hash_info;
2541         int ret = 0;
2542
2543         ret = bch2_trans_run(c,
2544                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2545                         POS(BCACHEFS_ROOT_INO, 0),
2546                         BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
2547                         k,
2548                         NULL, NULL,
2549                         BCH_TRANS_COMMIT_no_enospc,
2550                 check_xattr(trans, &iter, k, &hash_info, &inode)));
2551
2552         inode_walker_exit(&inode);
2553         bch_err_fn(c, ret);
2554         return ret;
2555 }
2556
2557 static int check_root_trans(struct btree_trans *trans)
2558 {
2559         struct bch_fs *c = trans->c;
2560         struct bch_inode_unpacked root_inode;
2561         u32 snapshot;
2562         u64 inum;
2563         int ret;
2564
2565         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2566         if (ret && !bch2_err_matches(ret, ENOENT))
2567                 return ret;
2568
2569         if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2570                                 "root subvol missing")) {
2571                 struct bkey_i_subvolume *root_subvol =
2572                         bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2573                 ret = PTR_ERR_OR_ZERO(root_subvol);
2574                 if (ret)
2575                         goto err;
2576
2577                 snapshot        = U32_MAX;
2578                 inum            = BCACHEFS_ROOT_INO;
2579
2580                 bkey_subvolume_init(&root_subvol->k_i);
2581                 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2582                 root_subvol->v.flags    = 0;
2583                 root_subvol->v.snapshot = cpu_to_le32(snapshot);
2584                 root_subvol->v.inode    = cpu_to_le64(inum);
2585                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2586                 bch_err_msg(c, ret, "writing root subvol");
2587                 if (ret)
2588                         goto err;
2589         }
2590
2591         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
2592         if (ret && !bch2_err_matches(ret, ENOENT))
2593                 return ret;
2594
2595         if (mustfix_fsck_err_on(ret,
2596                                 trans, root_dir_missing,
2597                                 "root directory missing") ||
2598             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2599                                 trans, root_inode_not_dir,
2600                                 "root inode not a directory")) {
2601                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2602                                 0, NULL);
2603                 root_inode.bi_inum = inum;
2604                 root_inode.bi_snapshot = snapshot;
2605
2606                 ret = __bch2_fsck_write_inode(trans, &root_inode);
2607                 bch_err_msg(c, ret, "writing root inode");
2608         }
2609 err:
2610 fsck_err:
2611         return ret;
2612 }
2613
2614 /* Get root directory, create if it doesn't exist: */
2615 int bch2_check_root(struct bch_fs *c)
2616 {
2617         int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2618                 check_root_trans(trans));
2619         bch_err_fn(c, ret);
2620         return ret;
2621 }
2622
2623 typedef DARRAY(u32) darray_u32;
2624
2625 static bool darray_u32_has(darray_u32 *d, u32 v)
2626 {
2627         darray_for_each(*d, i)
2628                 if (*i == v)
2629                         return true;
2630         return false;
2631 }
2632
2633 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2634 {
2635         struct bch_fs *c = trans->c;
2636         struct btree_iter parent_iter = {};
2637         darray_u32 subvol_path = {};
2638         struct printbuf buf = PRINTBUF;
2639         int ret = 0;
2640
2641         if (k.k->type != KEY_TYPE_subvolume)
2642                 return 0;
2643
2644         while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2645                 ret = darray_push(&subvol_path, k.k->p.offset);
2646                 if (ret)
2647                         goto err;
2648
2649                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2650
2651                 struct bch_inode_unpacked subvol_root;
2652                 ret = bch2_inode_find_by_inum_trans(trans,
2653                                         (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2654                                         &subvol_root);
2655                 if (ret)
2656                         break;
2657
2658                 u32 parent = le32_to_cpu(s.v->fs_path_parent);
2659
2660                 if (darray_u32_has(&subvol_path, parent)) {
2661                         if (fsck_err(c, subvol_loop, "subvolume loop"))
2662                                 ret = reattach_subvol(trans, s);
2663                         break;
2664                 }
2665
2666                 bch2_trans_iter_exit(trans, &parent_iter);
2667                 bch2_trans_iter_init(trans, &parent_iter,
2668                                      BTREE_ID_subvolumes, POS(0, parent), 0);
2669                 k = bch2_btree_iter_peek_slot(&parent_iter);
2670                 ret = bkey_err(k);
2671                 if (ret)
2672                         goto err;
2673
2674                 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2675                                 trans, subvol_unreachable,
2676                                 "unreachable subvolume %s",
2677                                 (bch2_bkey_val_to_text(&buf, c, s.s_c),
2678                                  buf.buf))) {
2679                         ret = reattach_subvol(trans, s);
2680                         break;
2681                 }
2682         }
2683 fsck_err:
2684 err:
2685         printbuf_exit(&buf);
2686         darray_exit(&subvol_path);
2687         bch2_trans_iter_exit(trans, &parent_iter);
2688         return ret;
2689 }
2690
2691 int bch2_check_subvolume_structure(struct bch_fs *c)
2692 {
2693         int ret = bch2_trans_run(c,
2694                 for_each_btree_key_commit(trans, iter,
2695                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
2696                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2697                         check_subvol_path(trans, &iter, k)));
2698         bch_err_fn(c, ret);
2699         return ret;
2700 }
2701
2702 struct pathbuf_entry {
2703         u64     inum;
2704         u32     snapshot;
2705 };
2706
2707 typedef DARRAY(struct pathbuf_entry) pathbuf;
2708
2709 static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
2710                                       u32 new_depth)
2711 {
2712         struct btree_iter iter;
2713         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
2714                                                SPOS(0, p->inum, p->snapshot), 0);
2715
2716         struct bch_inode_unpacked inode;
2717         int ret = bkey_err(k) ?:
2718                 !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
2719                 : bch2_inode_unpack(k, &inode);
2720         if (ret)
2721                 goto err;
2722
2723         if (inode.bi_depth != new_depth) {
2724                 inode.bi_depth = new_depth;
2725                 ret = __bch2_fsck_write_inode(trans, &inode) ?:
2726                         bch2_trans_commit(trans, NULL, NULL, 0);
2727         }
2728 err:
2729         bch2_trans_iter_exit(trans, &iter);
2730         return ret;
2731 }
2732
2733 static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
2734 {
2735         u32 restart_count = trans->restart_count;
2736         int ret = 0;
2737
2738         darray_for_each_reverse(*path, i) {
2739                 ret = nested_lockrestart_do(trans,
2740                                 bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
2741                 bch_err_fn(trans->c, ret);
2742                 if (ret)
2743                         break;
2744
2745                 new_bi_depth++;
2746         }
2747
2748         return ret ?: trans_was_restarted(trans, restart_count);
2749 }
2750
2751 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2752 {
2753         darray_for_each(*p, i)
2754                 if (i->inum     == inum &&
2755                     i->snapshot == snapshot)
2756                         return true;
2757         return false;
2758 }
2759
2760 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2761 {
2762         struct bch_fs *c = trans->c;
2763         struct btree_iter inode_iter = {};
2764         pathbuf path = {};
2765         struct printbuf buf = PRINTBUF;
2766         u32 snapshot = inode_k.k->p.snapshot;
2767         bool redo_bi_depth = false;
2768         u32 min_bi_depth = U32_MAX;
2769         int ret = 0;
2770
2771         struct bch_inode_unpacked inode;
2772         ret = bch2_inode_unpack(inode_k, &inode);
2773         if (ret)
2774                 return ret;
2775
2776         while (!inode.bi_subvol) {
2777                 struct btree_iter dirent_iter;
2778                 struct bkey_s_c_dirent d;
2779                 u32 parent_snapshot = snapshot;
2780
2781                 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2782                 ret = bkey_err(d.s_c);
2783                 if (ret && !bch2_err_matches(ret, ENOENT))
2784                         goto out;
2785
2786                 if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2787                         bch2_trans_iter_exit(trans, &dirent_iter);
2788
2789                 if (bch2_err_matches(ret, ENOENT)) {
2790                         printbuf_reset(&buf);
2791                         bch2_bkey_val_to_text(&buf, c, inode_k);
2792                         bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2793                                 bch2_err_str(ret), buf.buf);
2794                         goto out;
2795                 }
2796
2797                 bch2_trans_iter_exit(trans, &dirent_iter);
2798
2799                 ret = darray_push(&path, ((struct pathbuf_entry) {
2800                         .inum           = inode.bi_inum,
2801                         .snapshot       = snapshot,
2802                 }));
2803                 if (ret)
2804                         return ret;
2805
2806                 snapshot = parent_snapshot;
2807
2808                 bch2_trans_iter_exit(trans, &inode_iter);
2809                 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2810                                              SPOS(0, inode.bi_dir, snapshot), 0);
2811
2812                 struct bch_inode_unpacked parent_inode;
2813                 ret = bkey_err(inode_k) ?:
2814                         !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2815                         : bch2_inode_unpack(inode_k, &parent_inode);
2816                 if (ret) {
2817                         /* Should have been caught in dirents pass */
2818                         bch_err_msg(c, ret, "error looking up parent directory");
2819                         goto out;
2820                 }
2821
2822                 min_bi_depth = parent_inode.bi_depth;
2823
2824                 if (parent_inode.bi_depth < inode.bi_depth &&
2825                     min_bi_depth < U16_MAX)
2826                         break;
2827
2828                 inode = parent_inode;
2829                 snapshot = inode_k.k->p.snapshot;
2830                 redo_bi_depth = true;
2831
2832                 if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2833                         /* XXX print path */
2834                         bch_err(c, "directory structure loop");
2835
2836                         darray_for_each(path, i)
2837                                 pr_err("%llu:%u", i->inum, i->snapshot);
2838                         pr_err("%llu:%u", inode.bi_inum, snapshot);
2839
2840                         if (fsck_err(trans, dir_loop, "directory structure loop")) {
2841                                 ret = remove_backpointer(trans, &inode);
2842                                 bch_err_msg(c, ret, "removing dirent");
2843                                 if (ret)
2844                                         break;
2845
2846                                 ret = reattach_inode(trans, &inode);
2847                                 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2848                         }
2849
2850                         goto out;
2851                 }
2852         }
2853
2854         if (inode.bi_subvol)
2855                 min_bi_depth = 0;
2856
2857         if (redo_bi_depth)
2858                 ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2859 out:
2860 fsck_err:
2861         bch2_trans_iter_exit(trans, &inode_iter);
2862         darray_exit(&path);
2863         printbuf_exit(&buf);
2864         bch_err_fn(c, ret);
2865         return ret;
2866 }
2867
2868 /*
2869  * Check for loops in the directory structure: all other connectivity issues
2870  * have been fixed by prior passes
2871  */
2872 int bch2_check_directory_structure(struct bch_fs *c)
2873 {
2874         int ret = bch2_trans_run(c,
2875                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2876                                           BTREE_ITER_intent|
2877                                           BTREE_ITER_prefetch|
2878                                           BTREE_ITER_all_snapshots, k,
2879                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2880                         if (!S_ISDIR(bkey_inode_mode(k)))
2881                                 continue;
2882
2883                         if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2884                                 continue;
2885
2886                         check_path_loop(trans, k);
2887                 })));
2888
2889         bch_err_fn(c, ret);
2890         return ret;
2891 }
2892
2893 struct nlink_table {
2894         size_t          nr;
2895         size_t          size;
2896
2897         struct nlink {
2898                 u64     inum;
2899                 u32     snapshot;
2900                 u32     count;
2901         }               *d;
2902 };
2903
2904 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2905                      u64 inum, u32 snapshot)
2906 {
2907         if (t->nr == t->size) {
2908                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2909                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2910
2911                 if (!d) {
2912                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2913                                 new_size);
2914                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2915                 }
2916
2917                 if (t->d)
2918                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2919                 kvfree(t->d);
2920
2921                 t->d = d;
2922                 t->size = new_size;
2923         }
2924
2925
2926         t->d[t->nr++] = (struct nlink) {
2927                 .inum           = inum,
2928                 .snapshot       = snapshot,
2929         };
2930
2931         return 0;
2932 }
2933
2934 static int nlink_cmp(const void *_l, const void *_r)
2935 {
2936         const struct nlink *l = _l;
2937         const struct nlink *r = _r;
2938
2939         return cmp_int(l->inum, r->inum);
2940 }
2941
2942 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2943                      struct nlink_table *links,
2944                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2945 {
2946         struct nlink *link, key = {
2947                 .inum = inum, .snapshot = U32_MAX,
2948         };
2949
2950         if (inum < range_start || inum >= range_end)
2951                 return;
2952
2953         link = __inline_bsearch(&key, links->d, links->nr,
2954                                 sizeof(links->d[0]), nlink_cmp);
2955         if (!link)
2956                 return;
2957
2958         while (link > links->d && link[0].inum == link[-1].inum)
2959                 --link;
2960
2961         for (; link < links->d + links->nr && link->inum == inum; link++)
2962                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2963                         link->count++;
2964                         if (link->snapshot >= snapshot)
2965                                 break;
2966                 }
2967 }
2968
2969 noinline_for_stack
2970 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2971                                        struct nlink_table *t,
2972                                        u64 start, u64 *end)
2973 {
2974         int ret = bch2_trans_run(c,
2975                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2976                                    POS(0, start),
2977                                    BTREE_ITER_intent|
2978                                    BTREE_ITER_prefetch|
2979                                    BTREE_ITER_all_snapshots, k, ({
2980                         if (!bkey_is_inode(k.k))
2981                                 continue;
2982
2983                         /* Should never fail, checked by bch2_inode_invalid: */
2984                         struct bch_inode_unpacked u;
2985                         _ret3 = bch2_inode_unpack(k, &u);
2986                         if (_ret3)
2987                                 break;
2988
2989                         /*
2990                          * Backpointer and directory structure checks are sufficient for
2991                          * directories, since they can't have hardlinks:
2992                          */
2993                         if (S_ISDIR(u.bi_mode))
2994                                 continue;
2995
2996                         /*
2997                          * Previous passes ensured that bi_nlink is nonzero if
2998                          * it had multiple hardlinks:
2999                          */
3000                         if (!u.bi_nlink)
3001                                 continue;
3002
3003                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
3004                         if (ret) {
3005                                 *end = k.k->p.offset;
3006                                 ret = 0;
3007                                 break;
3008                         }
3009                         0;
3010                 })));
3011
3012         bch_err_fn(c, ret);
3013         return ret;
3014 }
3015
3016 noinline_for_stack
3017 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
3018                                      u64 range_start, u64 range_end)
3019 {
3020         struct snapshots_seen s;
3021
3022         snapshots_seen_init(&s);
3023
3024         int ret = bch2_trans_run(c,
3025                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
3026                                    BTREE_ITER_intent|
3027                                    BTREE_ITER_prefetch|
3028                                    BTREE_ITER_all_snapshots, k, ({
3029                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
3030                         if (ret)
3031                                 break;
3032
3033                         if (k.k->type == KEY_TYPE_dirent) {
3034                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
3035
3036                                 if (d.v->d_type != DT_DIR &&
3037                                     d.v->d_type != DT_SUBVOL)
3038                                         inc_link(c, &s, links, range_start, range_end,
3039                                                  le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
3040                         }
3041                         0;
3042                 })));
3043
3044         snapshots_seen_exit(&s);
3045
3046         bch_err_fn(c, ret);
3047         return ret;
3048 }
3049
3050 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
3051                                      struct bkey_s_c k,
3052                                      struct nlink_table *links,
3053                                      size_t *idx, u64 range_end)
3054 {
3055         struct bch_inode_unpacked u;
3056         struct nlink *link = &links->d[*idx];
3057         int ret = 0;
3058
3059         if (k.k->p.offset >= range_end)
3060                 return 1;
3061
3062         if (!bkey_is_inode(k.k))
3063                 return 0;
3064
3065         ret = bch2_inode_unpack(k, &u);
3066         if (ret)
3067                 return ret;
3068
3069         if (S_ISDIR(u.bi_mode))
3070                 return 0;
3071
3072         if (!u.bi_nlink)
3073                 return 0;
3074
3075         while ((cmp_int(link->inum, k.k->p.offset) ?:
3076                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
3077                 BUG_ON(*idx == links->nr);
3078                 link = &links->d[++*idx];
3079         }
3080
3081         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
3082                         trans, inode_wrong_nlink,
3083                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
3084                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
3085                         bch2_inode_nlink_get(&u), link->count)) {
3086                 bch2_inode_nlink_set(&u, link->count);
3087                 ret = __bch2_fsck_write_inode(trans, &u);
3088         }
3089 fsck_err:
3090         return ret;
3091 }
3092
3093 noinline_for_stack
3094 static int check_nlinks_update_hardlinks(struct bch_fs *c,
3095                                struct nlink_table *links,
3096                                u64 range_start, u64 range_end)
3097 {
3098         size_t idx = 0;
3099
3100         int ret = bch2_trans_run(c,
3101                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
3102                                 POS(0, range_start),
3103                                 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
3104                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
3105                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
3106         if (ret < 0) {
3107                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
3108                 return ret;
3109         }
3110
3111         return 0;
3112 }
3113
3114 int bch2_check_nlinks(struct bch_fs *c)
3115 {
3116         struct nlink_table links = { 0 };
3117         u64 this_iter_range_start, next_iter_range_start = 0;
3118         int ret = 0;
3119
3120         do {
3121                 this_iter_range_start = next_iter_range_start;
3122                 next_iter_range_start = U64_MAX;
3123
3124                 ret = check_nlinks_find_hardlinks(c, &links,
3125                                                   this_iter_range_start,
3126                                                   &next_iter_range_start);
3127
3128                 ret = check_nlinks_walk_dirents(c, &links,
3129                                           this_iter_range_start,
3130                                           next_iter_range_start);
3131                 if (ret)
3132                         break;
3133
3134                 ret = check_nlinks_update_hardlinks(c, &links,
3135                                          this_iter_range_start,
3136                                          next_iter_range_start);
3137                 if (ret)
3138                         break;
3139
3140                 links.nr = 0;
3141         } while (next_iter_range_start != U64_MAX);
3142
3143         kvfree(links.d);
3144         bch_err_fn(c, ret);
3145         return ret;
3146 }
3147
3148 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
3149                              struct bkey_s_c k)
3150 {
3151         struct bkey_s_c_reflink_p p;
3152         struct bkey_i_reflink_p *u;
3153
3154         if (k.k->type != KEY_TYPE_reflink_p)
3155                 return 0;
3156
3157         p = bkey_s_c_to_reflink_p(k);
3158
3159         if (!p.v->front_pad && !p.v->back_pad)
3160                 return 0;
3161
3162         u = bch2_trans_kmalloc(trans, sizeof(*u));
3163         int ret = PTR_ERR_OR_ZERO(u);
3164         if (ret)
3165                 return ret;
3166
3167         bkey_reassemble(&u->k_i, k);
3168         u->v.front_pad  = 0;
3169         u->v.back_pad   = 0;
3170
3171         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
3172 }
3173
3174 int bch2_fix_reflink_p(struct bch_fs *c)
3175 {
3176         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
3177                 return 0;
3178
3179         int ret = bch2_trans_run(c,
3180                 for_each_btree_key_commit(trans, iter,
3181                                 BTREE_ID_extents, POS_MIN,
3182                                 BTREE_ITER_intent|BTREE_ITER_prefetch|
3183                                 BTREE_ITER_all_snapshots, k,
3184                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
3185                         fix_reflink_p_key(trans, &iter, k)));
3186         bch_err_fn(c, ret);
3187         return ret;
3188 }
3189
3190 #ifndef NO_BCACHEFS_CHARDEV
3191
3192 struct fsck_thread {
3193         struct thread_with_stdio thr;
3194         struct bch_fs           *c;
3195         struct bch_opts         opts;
3196 };
3197
3198 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3199 {
3200         struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3201         kfree(thr);
3202 }
3203
3204 static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
3205 {
3206         struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
3207         struct bch_fs *c = thr->c;
3208
3209         int ret = PTR_ERR_OR_ZERO(c);
3210         if (ret)
3211                 return ret;
3212
3213         ret = bch2_fs_start(thr->c);
3214         if (ret)
3215                 goto err;
3216
3217         if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
3218                 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
3219                 ret |= 1;
3220         }
3221         if (test_bit(BCH_FS_error, &c->flags)) {
3222                 bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
3223                 ret |= 4;
3224         }
3225 err:
3226         bch2_fs_stop(c);
3227         return ret;
3228 }
3229
3230 static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
3231         .exit           = bch2_fsck_thread_exit,
3232         .fn             = bch2_fsck_offline_thread_fn,
3233 };
3234
3235 long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
3236 {
3237         struct bch_ioctl_fsck_offline arg;
3238         struct fsck_thread *thr = NULL;
3239         darray_str(devs) = {};
3240         long ret = 0;
3241
3242         if (copy_from_user(&arg, user_arg, sizeof(arg)))
3243                 return -EFAULT;
3244
3245         if (arg.flags)
3246                 return -EINVAL;
3247
3248         if (!capable(CAP_SYS_ADMIN))
3249                 return -EPERM;
3250
3251         for (size_t i = 0; i < arg.nr_devs; i++) {
3252                 u64 dev_u64;
3253                 ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
3254                 if (ret)
3255                         goto err;
3256
3257                 char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
3258                 ret = PTR_ERR_OR_ZERO(dev_str);
3259                 if (ret)
3260                         goto err;
3261
3262                 ret = darray_push(&devs, dev_str);
3263                 if (ret) {
3264                         kfree(dev_str);
3265                         goto err;
3266                 }
3267         }
3268
3269         thr = kzalloc(sizeof(*thr), GFP_KERNEL);
3270         if (!thr) {
3271                 ret = -ENOMEM;
3272                 goto err;
3273         }
3274
3275         thr->opts = bch2_opts_empty();
3276
3277         if (arg.opts) {
3278                 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
3279                 ret =   PTR_ERR_OR_ZERO(optstr) ?:
3280                         bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
3281                 if (!IS_ERR(optstr))
3282                         kfree(optstr);
3283
3284                 if (ret)
3285                         goto err;
3286         }
3287
3288         opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
3289         opt_set(thr->opts, read_only, 1);
3290         opt_set(thr->opts, ratelimit_errors, 0);
3291
3292         /* We need request_key() to be called before we punt to kthread: */
3293         opt_set(thr->opts, nostart, true);
3294
3295         bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
3296
3297         thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
3298
3299         if (!IS_ERR(thr->c) &&
3300             thr->c->opts.errors == BCH_ON_ERROR_panic)
3301                 thr->c->opts.errors = BCH_ON_ERROR_ro;
3302
3303         ret = __bch2_run_thread_with_stdio(&thr->thr);
3304 out:
3305         darray_for_each(devs, i)
3306                 kfree(*i);
3307         darray_exit(&devs);
3308         return ret;
3309 err:
3310         if (thr)
3311                 bch2_fsck_thread_exit(&thr->thr);
3312         pr_err("ret %s", bch2_err_str(ret));
3313         goto out;
3314 }
3315
3316 static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
3317 {
3318         struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
3319         struct bch_fs *c = thr->c;
3320
3321         c->stdio_filter = current;
3322         c->stdio = &thr->thr.stdio;
3323
3324         /*
3325          * XXX: can we figure out a way to do this without mucking with c->opts?
3326          */
3327         unsigned old_fix_errors = c->opts.fix_errors;
3328         if (opt_defined(thr->opts, fix_errors))
3329                 c->opts.fix_errors = thr->opts.fix_errors;
3330         else
3331                 c->opts.fix_errors = FSCK_FIX_ask;
3332
3333         c->opts.fsck = true;
3334         set_bit(BCH_FS_fsck_running, &c->flags);
3335
3336         c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
3337         int ret = bch2_run_online_recovery_passes(c);
3338
3339         clear_bit(BCH_FS_fsck_running, &c->flags);
3340         bch_err_fn(c, ret);
3341
3342         c->stdio = NULL;
3343         c->stdio_filter = NULL;
3344         c->opts.fix_errors = old_fix_errors;
3345
3346         up(&c->online_fsck_mutex);
3347         bch2_ro_ref_put(c);
3348         return ret;
3349 }
3350
3351 static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
3352         .exit           = bch2_fsck_thread_exit,
3353         .fn             = bch2_fsck_online_thread_fn,
3354 };
3355
3356 long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
3357 {
3358         struct fsck_thread *thr = NULL;
3359         long ret = 0;
3360
3361         if (arg.flags)
3362                 return -EINVAL;
3363
3364         if (!capable(CAP_SYS_ADMIN))
3365                 return -EPERM;
3366
3367         if (!bch2_ro_ref_tryget(c))
3368                 return -EROFS;
3369
3370         if (down_trylock(&c->online_fsck_mutex)) {
3371                 bch2_ro_ref_put(c);
3372                 return -EAGAIN;
3373         }
3374
3375         thr = kzalloc(sizeof(*thr), GFP_KERNEL);
3376         if (!thr) {
3377                 ret = -ENOMEM;
3378                 goto err;
3379         }
3380
3381         thr->c = c;
3382         thr->opts = bch2_opts_empty();
3383
3384         if (arg.opts) {
3385                 char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
3386
3387                 ret =   PTR_ERR_OR_ZERO(optstr) ?:
3388                         bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
3389                 if (!IS_ERR(optstr))
3390                         kfree(optstr);
3391
3392                 if (ret)
3393                         goto err;
3394         }
3395
3396         ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
3397 err:
3398         if (ret < 0) {
3399                 bch_err_fn(c, ret);
3400                 if (thr)
3401                         bch2_fsck_thread_exit(&thr->thr);
3402                 up(&c->online_fsck_mutex);
3403                 bch2_ro_ref_put(c);
3404         }
3405         return ret;
3406 }
3407
3408 #endif /* NO_BCACHEFS_CHARDEV */
This page took 0.231846 seconds and 4 git commands to generate.