// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "acl.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "errcode.h"
#include "extents.h"
#include "fs.h"
#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>

static struct kmem_cache *bch2_inode_cache;

static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *,
                                struct bch_subvolume *);

void bch2_inode_update_after_write(struct btree_trans *trans,
                                   struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   unsigned fields)
{
        struct bch_fs *c = trans->c;

        BUG_ON(bi->bi_inum != inode->v.i_ino);

        bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));

        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
        inode->v.i_mode = bi->bi_mode;

        if (fields & ATTR_ATIME)
                inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
        if (fields & ATTR_MTIME)
                inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
        if (fields & ATTR_CTIME)
                inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

        inode->ei_inode         = *bi;

        bch2_inode_flags_to_vfs(inode);
}

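/*
 * Update an inode in the btree while keeping the cached copy in sync: look up
 * the on-disk inode, apply @set if given, write it back and commit, retrying
 * on transaction restart; on success the VFS inode is refreshed from what was
 * committed.
 */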
int __must_check bch2_write_inode(struct bch_fs *c,
                                  struct bch_inode_info *inode,
                                  inode_set_fn set,
                                  void *p, unsigned fields)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;
retry:
        bch2_trans_begin(trans);

        ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
                                BTREE_ITER_intent) ?:
                (set ? set(trans, inode, &inode_u, p) : 0) ?:
                bch2_inode_write(trans, &iter, &inode_u) ?:
                bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
                bch2_inode_update_after_write(trans, inode, &inode_u, fields);

        bch2_trans_iter_exit(trans, &iter);

        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;

        bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
                             "%s: inode %llu:%llu not found when updating",
                             bch2_err_str(ret),
                             inode_inum(inode).subvol,
                             inode_inum(inode).inum);

        bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
}

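/*
 * Move this inode's quota usage (i_blocks plus outstanding reservations) to
 * @new_qid, for the enabled quota types in @qtypes that are actually changing.
 */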
int bch2_fs_quota_transfer(struct bch_fs *c,
                           struct bch_inode_info *inode,
                           struct bch_qid new_qid,
                           unsigned qtypes,
                           enum quota_acct_mode mode)
{
        unsigned i;
        int ret;

        qtypes &= enabled_qtypes(c);

        for (i = 0; i < QTYP_NR; i++)
                if (new_qid.q[i] == inode->ei_qid.q[i])
                        qtypes &= ~(1U << i);

        if (!qtypes)
                return 0;

        mutex_lock(&inode->ei_quota_lock);

        ret = bch2_quota_transfer(c, qtypes, new_qid,
                                  inode->ei_qid,
                                  inode->v.i_blocks +
                                  inode->ei_quota_reserved,
                                  mode);
        if (!ret)
                for (i = 0; i < QTYP_NR; i++)
                        if (qtypes & (1 << i))
                                inode->ei_qid.q[i] = new_qid.q[i];

        mutex_unlock(&inode->ei_quota_lock);

        return ret;
}

static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
        return a.subvol == b.subvol && a.inum == b.inum;
}

static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
        const subvol_inum *inum = data;

        return jhash(&inum->inum, sizeof(inum->inum), seed);
}

static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
        const struct bch_inode_info *inode = data;

        return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
}

static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
                                 const void *obj)
{
        const struct bch_inode_info *inode = obj;
        const subvol_inum *v = arg->key;

        return !subvol_inum_eq(inode->ei_inum, *v);
}

static const struct rhashtable_params bch2_vfs_inodes_params = {
        .head_offset            = offsetof(struct bch_inode_info, hash),
        .key_offset             = offsetof(struct bch_inode_info, ei_inum),
        .key_len                = sizeof(subvol_inum),
        .hashfn                 = bch2_vfs_inode_hash_fn,
        .obj_hashfn             = bch2_vfs_inode_obj_hash_fn,
        .obj_cmpfn              = bch2_vfs_inode_cmp_fn,
        .automatic_shrinking    = true,
};

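/*
 * Returns true if the inode at @p, or the same inode number in a descendent
 * snapshot, is currently present in the VFS inode hash table; may also return
 * a negative error.
 */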
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
        struct bch_fs *c = trans->c;
        struct rhashtable *ht = &c->vfs_inodes_table;
        subvol_inum inum = (subvol_inum) { .inum = p.offset };
        DARRAY(u32) subvols;
        int ret = 0;

        if (!test_bit(BCH_FS_started, &c->flags))
                return false;

        darray_init(&subvols);
restart_from_top:

        /*
         * Tweaked version of __rhashtable_lookup(); we need to get a list of
         * subvolumes in which the given inode number is open.
         *
         * For this to work, we don't include the subvolume ID in the key that
         * we hash - all inodes with the same inode number regardless of
         * subvolume will hash to the same slot.
         *
         * This will be less than ideal if the same file is ever open
         * simultaneously in many different snapshots:
         */
        rcu_read_lock();
        struct rhash_lock_head __rcu *const *bkt;
        struct rhash_head *he;
        unsigned int hash;
        struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
        hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
        bkt = rht_bucket(tbl, hash);
        do {
                struct bch_inode_info *inode;

                rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
                        if (inode->ei_inum.inum == inum.inum) {
                                ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
                                                      GFP_NOWAIT|__GFP_NOWARN);
                                if (ret) {
                                        rcu_read_unlock();
                                        ret = darray_make_room(&subvols, 1);
                                        if (ret)
                                                goto err;
                                        subvols.nr = 0;
                                        goto restart_from_top;
                                }
                        }
                }
                /* An object might have been moved to a different hash chain,
                 * while we walk along it - better check and retry.
                 */
        } while (he != RHT_NULLS_MARKER(bkt));

        /* Ensure we see any new tables. */
        smp_rmb();

        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;
        rcu_read_unlock();

        darray_for_each(subvols, i) {
                u32 snap;
                ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
                if (ret)
                        goto err;

                ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
                if (ret)
                        break;
        }
err:
        darray_exit(&subvols);
        return ret;
}

static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
        return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}

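/*
 * Wait for an inode that is being evicted (I_FREEING/I_WILL_FREE) to drop out
 * of the hash table; pairs with the inode_wake_up_bit() in
 * bch2_inode_hash_remove(). Called with, and releases, i_lock.
 */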
static void __wait_on_freeing_inode(struct bch_fs *c,
                                    struct bch_inode_info *inode,
                                    subvol_inum inum)
{
        wait_queue_head_t *wq;
        struct wait_bit_queue_entry wait;

        wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->v.i_lock);

        if (__bch2_inode_hash_find(c, inum) == inode)
                schedule_timeout(HZ * 10);
        finish_wait(wq, &wait.wq_entry);
}

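/*
 * Look up an inode in the VFS inode hash table and take a reference; if it is
 * being freed, wait for it to go away (dropping and relocking @trans, if
 * given) and retry.
 */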
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
                                                   subvol_inum inum)
{
        struct bch_inode_info *inode;
repeat:
        inode = __bch2_inode_hash_find(c, inum);
        if (inode) {
                spin_lock(&inode->v.i_lock);
                if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
                        spin_unlock(&inode->v.i_lock);
                        return NULL;
                }
                if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
                        if (!trans) {
                                __wait_on_freeing_inode(c, inode, inum);
                        } else {
                                bch2_trans_unlock(trans);
                                __wait_on_freeing_inode(c, inode, inum);
                                int ret = bch2_trans_relock(trans);
                                if (ret)
                                        return ERR_PTR(ret);
                        }
                        goto repeat;
                }
                __iget(&inode->v);
                spin_unlock(&inode->v.i_lock);
        }

        return inode;
}

static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
        spin_lock(&inode->v.i_lock);
        bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
        spin_unlock(&inode->v.i_lock);

        if (remove) {
                int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
                                        &inode->hash, bch2_vfs_inodes_params);
                BUG_ON(ret);
                inode->v.i_hash.pprev = NULL;
                /*
                 * This pairs with the bch2_inode_hash_find() ->
                 * __wait_on_freeing_inode() path
                 */
                inode_wake_up_bit(&inode->v, __I_NEW);
        }
}

static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
                                                     struct btree_trans *trans,
                                                     struct bch_inode_info *inode)
{
        struct bch_inode_info *old = inode;

        set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
        if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
                                        &inode->ei_inum,
                                        &inode->hash,
                                        bch2_vfs_inodes_params))) {
                old = bch2_inode_hash_find(c, trans, inode->ei_inum);
                if (!old)
                        goto retry;

                clear_bit(EI_INODE_HASHED, &inode->ei_flags);

                /*
                 * bcachefs doesn't use I_NEW; we have no use for it since we
                 * only insert fully created inodes in the inode hash table. But
                 * discard_new_inode() expects it to be set...
                 */
                inode->v.i_state |= I_NEW;
                /*
                 * We don't want bch2_evict_inode() to delete the inode on disk,
                 * we just raced and had another inode in cache. Normally new
                 * inodes don't have nlink == 0 - except tmpfiles do...
                 */
                set_nlink(&inode->v, 1);
                discard_new_inode(&inode->v);
                return old;
        } else {
                inode_fake_hash(&inode->v);

                inode_sb_list_add(&inode->v);

                mutex_lock(&c->vfs_inodes_lock);
                list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
                mutex_unlock(&c->vfs_inodes_lock);
                return inode;
        }
}

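/*
 * Evaluate @_do with the given PF_MEMALLOC* flags set on the current task,
 * restoring the previous flags afterwards.
 */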
#define memalloc_flags_do(_flags, _do)                                          \
({                                                                              \
        unsigned _saved_flags = memalloc_flags_save(_flags);                    \
        typeof(_do) _ret = _do;                                                 \
        memalloc_noreclaim_restore(_saved_flags);                               \
        _ret;                                                                   \
})

static struct inode *bch2_alloc_inode(struct super_block *sb)
{
        BUG();
}

static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
        struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
                                                bch2_inode_cache, gfp);
        if (!inode)
                return NULL;

        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
        two_state_lock_init(&inode->ei_pagecache_lock);
        INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
        inode->ei_flags = 0;
        mutex_init(&inode->ei_quota_lock);
        memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

        if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
                kmem_cache_free(bch2_inode_cache, inode);
                return NULL;
        }

        return inode;
}

/*
 * Allocate a new inode, dropping/retaking btree locks if necessary:
 */
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
        struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);

        if (unlikely(!inode)) {
                int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
                if (ret && inode) {
                        __destroy_inode(&inode->v);
                        kmem_cache_free(bch2_inode_cache, inode);
                }
                if (ret)
                        return ERR_PTR(ret);
        }

        return inode;
}

static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
                                                          subvol_inum inum,
                                                          struct bch_inode_unpacked *bi,
                                                          struct bch_subvolume *subvol)
{
        struct bch_inode_info *inode = bch2_new_inode(trans);
        if (IS_ERR(inode))
                return inode;

        bch2_vfs_inode_init(trans, inum, inode, bi, subvol);

        return bch2_inode_hash_insert(trans->c, trans, inode);

}

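/*
 * Get the VFS inode for @inum, either from the inode hash table or by reading
 * it from the btree and inserting it.
 */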
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
        struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
        if (inode)
                return &inode->v;

        struct btree_trans *trans = bch2_trans_get(c);

        struct bch_inode_unpacked inode_u;
        struct bch_subvolume subvol;
        int ret = lockrestart_do(trans,
                bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
                bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
                PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
        bch2_trans_put(trans);

        return ret ? ERR_PTR(ret) : &inode->v;
}

struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
              struct bch_inode_info *dir, struct dentry *dentry,
              umode_t mode, dev_t rdev, subvol_inum snapshot_src,
              unsigned flags)
{
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans *trans;
        struct bch_inode_unpacked dir_u;
        struct bch_inode_info *inode;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        subvol_inum inum;
        struct bch_subvolume subvol;
        u64 journal_seq = 0;
        kuid_t kuid;
        kgid_t kgid;
        int ret;

        /*
         * preallocate acls + vfs inode before btree transaction, so that
         * nothing can fail after the transaction succeeds:
         */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
        if (ret)
                return ERR_PTR(ret);
#endif
        inode = __bch2_new_inode(c, GFP_NOFS);
        if (unlikely(!inode)) {
                inode = ERR_PTR(-ENOMEM);
                goto err;
        }

        bch2_inode_init_early(c, &inode_u);

        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);

        trans = bch2_trans_get(c);
retry:
        bch2_trans_begin(trans);

        kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
        kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
        ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
                bch2_create_trans(trans,
                                  inode_inum(dir), &dir_u, &inode_u,
                                  !(flags & BCH_CREATE_TMPFILE)
                                  ? &dentry->d_name : NULL,
                                  from_kuid(i_user_ns(&dir->v), kuid),
                                  from_kgid(i_user_ns(&dir->v), kgid),
                                  mode, rdev,
                                  default_acl, acl, snapshot_src, flags) ?:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
                                KEY_TYPE_QUOTA_PREALLOC);
        if (unlikely(ret))
                goto err_before_quota;

        inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
        inum.inum = inode_u.bi_inum;

        ret   = bch2_subvolume_get(trans, inum.subvol, true,
                                   BTREE_ITER_with_updates, &subvol) ?:
                bch2_trans_commit(trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
err_before_quota:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
                goto err_trans;
        }

        if (!(flags & BCH_CREATE_TMPFILE)) {
                bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                mutex_unlock(&dir->ei_update_lock);
        }

        bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

        /*
         * we must insert the new inode into the inode cache before calling
         * bch2_trans_exit() and dropping locks, else we could race with another
         * thread pulling the inode in and modifying it:
         *
         * also, calling bch2_inode_hash_insert() without passing in the
         * transaction object is sketchy - if we could ever end up in
         * __wait_on_freeing_inode(), we'd risk deadlock.
         *
         * But that shouldn't be possible, since we still have the inode locked
         * that we just created, and we _really_ can't take a transaction
         * restart here.
         */
        inode = bch2_inode_hash_insert(c, NULL, inode);
        bch2_trans_put(trans);
err:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
err_trans:
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);

        bch2_trans_put(trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
        goto err;
}

/* methods */

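/*
 * Look up @name in directory @dir and return the target VFS inode, reading it
 * in if it isn't already cached; also checks that the inode's backpointer
 * points back at this dirent.
 */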
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
                        subvol_inum dir, struct bch_hash_info *dir_hash_info,
                        const struct qstr *name)
{
        struct bch_fs *c = trans->c;
        struct btree_iter dirent_iter = {};
        subvol_inum inum = {};
        struct printbuf buf = PRINTBUF;

        struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
                                             dir_hash_info, dir, name, 0);
        int ret = bkey_err(k);
        if (ret)
                return ERR_PTR(ret);

        ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
        if (ret > 0)
                ret = -ENOENT;
        if (ret)
                goto err;

        struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
        if (inode)
                goto out;

        struct bch_subvolume subvol;
        struct bch_inode_unpacked inode_u;
        ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
                bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
                PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));

        bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
                                c, "dirent to missing inode:\n  %s",
                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
        if (ret)
                goto err;

        /* regular files may have hardlinks: */
        if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
                                    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
                                    c,
                                    "dirent points to inode that does not point back:\n  %s",
                                    (bch2_bkey_val_to_text(&buf, c, k),
                                     prt_printf(&buf, "\n  "),
                                     bch2_inode_unpacked_to_text(&buf, &inode_u),
                                     buf.buf))) {
                ret = -ENOENT;
                goto err;
        }
out:
        bch2_trans_iter_exit(trans, &dirent_iter);
        printbuf_exit(&buf);
        return inode;
err:
        inode = ERR_PTR(ret);
        goto out;
}

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);

        struct bch_inode_info *inode;
        bch2_trans_do(c,
                PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
                                                          &hash, &dentry->d_name)));
        if (IS_ERR(inode))
                inode = NULL;

        return d_splice_alias(&inode->v, dentry);
}

static int bch2_mknod(struct mnt_idmap *idmap,
                      struct inode *vdir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
{
        struct bch_inode_info *inode =
                __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
                              (subvol_inum) { 0 }, 0);

        if (IS_ERR(inode))
                return bch2_err_class(PTR_ERR(inode));

        d_instantiate(dentry, &inode->v);
        return 0;
}

static int bch2_create(struct mnt_idmap *idmap,
                       struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
{
        return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}

static int __bch2_link(struct bch_fs *c,
                       struct bch_inode_info *inode,
                       struct bch_inode_info *dir,
                       struct dentry *dentry)
{
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;

        mutex_lock(&inode->ei_update_lock);
        struct btree_trans *trans = bch2_trans_get(c);

        ret = commit_do(trans, NULL, NULL, 0,
                        bch2_link_trans(trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
                                        &dentry->d_name));

        if (likely(!ret)) {
                bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
        }

        bch2_trans_put(trans);
        mutex_unlock(&inode->ei_update_lock);
        return ret;
}

static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
                     struct dentry *dentry)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
                bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
                __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                return bch2_err_class(ret);

        ihold(&inode->v);
        d_instantiate(dentry, &inode->v);
        return 0;
}

int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                  bool deleting_snapshot)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;

        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

        struct btree_trans *trans = bch2_trans_get(c);

        ret = commit_do(trans, NULL, NULL,
                        BCH_TRANS_COMMIT_no_enospc,
                bch2_unlink_trans(trans,
                                  inode_inum(dir), &dir_u,
                                  &inode_u, &dentry->d_name,
                                  deleting_snapshot));
        if (unlikely(ret))
                goto err;

        bch2_inode_update_after_write(trans, dir, &dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
        bch2_inode_update_after_write(trans, inode, &inode_u,
                                      ATTR_MTIME);

        if (inode_u.bi_subvol) {
                /*
                 * Subvolume deletion is asynchronous, but we still want to tell
                 * the VFS that it's been deleted here:
                 */
                set_nlink(&inode->v, 0);
        }
err:
        bch2_trans_put(trans);
        bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

        return ret;
}

static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_fs *c = dir->v.i_sb->s_fs_info;

        int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
                __bch2_unlink(vdir, dentry, false);
        return bch2_err_class(ret);
}

static int bch2_symlink(struct mnt_idmap *idmap,
                        struct inode *vdir, struct dentry *dentry,
                        const char *symname)
{
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;

        inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
        if (IS_ERR(inode))
                return bch2_err_class(PTR_ERR(inode));

        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
        inode_unlock(&inode->v);

        if (unlikely(ret))
                goto err;

        ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
        if (unlikely(ret))
                goto err;

        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;

        d_instantiate(dentry, &inode->v);
        return 0;
err:
        iput(&inode->v);
        return bch2_err_class(ret);
}

static int bch2_mkdir(struct mnt_idmap *idmap,
                      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
        return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}

static int bch2_rename2(struct mnt_idmap *idmap,
                        struct inode *src_vdir, struct dentry *src_dentry,
                        struct inode *dst_vdir, struct dentry *dst_dentry,
                        unsigned flags)
{
        struct bch_fs *c = src_vdir->i_sb->s_fs_info;
        struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
        struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
        struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
        struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
        struct bch_inode_unpacked dst_dir_u, src_dir_u;
        struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
        struct btree_trans *trans;
        enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
                ? BCH_RENAME_OVERWRITE : BCH_RENAME;
        bool whiteout = !!(flags & RENAME_WHITEOUT);
        int ret;

        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
                return -EINVAL;

        if (mode == BCH_RENAME_OVERWRITE) {
                ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
                                                   0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        bch2_lock_inodes(INODE_UPDATE_LOCK,
                         src_dir,
                         dst_dir,
                         src_inode,
                         dst_inode);

        trans = bch2_trans_get(c);

        ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
                bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
        if (ret)
                goto err_tx_restart;

        if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, src_inode,
                                             dst_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }

        if (mode == BCH_RENAME_EXCHANGE &&
            inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
                ret = bch2_fs_quota_transfer(c, dst_inode,
                                             src_dir->ei_qid,
                                             1 << QTYP_PRJ,
                                             KEY_TYPE_QUOTA_PREALLOC);
                if (ret)
                        goto err;
        }
retry:
        bch2_trans_begin(trans);

        ret = bch2_rename_trans(trans,
                                inode_inum(src_dir), &src_dir_u,
                                inode_inum(dst_dir), &dst_dir_u,
                                &src_inode_u,
                                &dst_inode_u,
                                &src_dentry->d_name,
                                &dst_dentry->d_name,
                                mode);
        if (unlikely(ret))
                goto err_tx_restart;

        if (whiteout) {
                whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
                ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
                if (unlikely(ret))
                        goto err_tx_restart;
                bch2_inode_init_early(c, whiteout_inode_u);

                ret = bch2_create_trans(trans,
                                        inode_inum(src_dir), &src_dir_u,
                                        whiteout_inode_u,
                                        &src_dentry->d_name,
                                        from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
                                        from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
                                        S_IFCHR|WHITEOUT_MODE, 0,
                                        NULL, NULL, (subvol_inum) { 0 }, 0) ?:
                      bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
                                      KEY_TYPE_QUOTA_PREALLOC);
                if (unlikely(ret))
                        goto err_tx_restart;
        }

        ret = bch2_trans_commit(trans, NULL, NULL, 0);
        if (unlikely(ret)) {
err_tx_restart:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
                goto err;
        }

        BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);

        bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);

        if (src_dir != dst_dir)
                bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);

        bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
                                      ATTR_CTIME);

        if (dst_inode)
                bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
err:
        bch2_trans_put(trans);

        bch2_fs_quota_transfer(c, src_inode,
                               bch_qid(&src_inode->ei_inode),
                               1 << QTYP_PRJ,
                               KEY_TYPE_QUOTA_NOCHECK);
        if (dst_inode)
                bch2_fs_quota_transfer(c, dst_inode,
                                       bch_qid(&dst_inode->ei_inode),
                                       1 << QTYP_PRJ,
                                       KEY_TYPE_QUOTA_NOCHECK);

        bch2_unlock_inodes(INODE_UPDATE_LOCK,
                           src_dir,
                           dst_dir,
                           src_inode,
                           dst_inode);

        return bch2_err_class(ret);
}

static void bch2_setattr_copy(struct mnt_idmap *idmap,
                              struct bch_inode_info *inode,
                              struct bch_inode_unpacked *bi,
                              struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        unsigned int ia_valid = attr->ia_valid;
        kuid_t kuid;
        kgid_t kgid;

        if (ia_valid & ATTR_UID) {
                kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
                bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
        }
        if (ia_valid & ATTR_GID) {
                kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
                bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
        }

        if (ia_valid & ATTR_SIZE)
                bi->bi_size = attr->ia_size;

        if (ia_valid & ATTR_ATIME)
                bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
        if (ia_valid & ATTR_MTIME)
                bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
        if (ia_valid & ATTR_CTIME)
                bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

        if (ia_valid & ATTR_MODE) {
                umode_t mode = attr->ia_mode;
                kgid_t gid = ia_valid & ATTR_GID
                        ? kgid
                        : inode->v.i_gid;

                if (!in_group_or_capable(idmap, &inode->v,
                        make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
                        mode &= ~S_ISGID;
                bi->bi_mode = mode;
        }
}

int bch2_setattr_nonsize(struct mnt_idmap *idmap,
                         struct bch_inode_info *inode,
                         struct iattr *attr)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
        struct btree_trans *trans;
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
        kuid_t kuid;
        kgid_t kgid;
        int ret;

        mutex_lock(&inode->ei_update_lock);

        qid = inode->ei_qid;

        if (attr->ia_valid & ATTR_UID) {
                kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
                qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
        }

        if (attr->ia_valid & ATTR_GID) {
                kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
                qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
        }

        ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                     KEY_TYPE_QUOTA_PREALLOC);
        if (ret)
                goto err;

        trans = bch2_trans_get(c);
retry:
        bch2_trans_begin(trans);
        kfree(acl);
        acl = NULL;

        ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_intent);
        if (ret)
                goto btree_err;

        bch2_setattr_copy(idmap, inode, &inode_u, attr);

        if (attr->ia_valid & ATTR_MODE) {
                ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
                                     inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }

        ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_TRANS_COMMIT_no_enospc);
btree_err:
        bch2_trans_iter_exit(trans, &inode_iter);

        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err_trans;

        bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
        bch2_trans_put(trans);
err:
        mutex_unlock(&inode->ei_update_lock);

        return bch2_err_class(ret);
}

static int bch2_getattr(struct mnt_idmap *idmap,
                        const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned query_flags)
{
        struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
        vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);

        stat->dev       = inode->v.i_sb->s_dev;
        stat->ino       = inode->v.i_ino;
        stat->mode      = inode->v.i_mode;
        stat->nlink     = inode->v.i_nlink;
        stat->uid       = vfsuid_into_kuid(vfsuid);
        stat->gid       = vfsgid_into_kgid(vfsgid);
        stat->rdev      = inode->v.i_rdev;
        stat->size      = i_size_read(&inode->v);
        stat->atime     = inode_get_atime(&inode->v);
        stat->mtime     = inode_get_mtime(&inode->v);
        stat->ctime     = inode_get_ctime(&inode->v);
        stat->blksize   = block_bytes(c);
        stat->blocks    = inode->v.i_blocks;

        stat->subvol    = inode->ei_inum.subvol;
        stat->result_mask |= STATX_SUBVOL;

        if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
                stat->result_mask |= STATX_DIOALIGN;
                /*
                 * this is incorrect; we should be tracking this in superblock,
                 * and checking the alignment of open devices
                 */
                stat->dio_mem_align = SECTOR_SIZE;
                stat->dio_offset_align = block_bytes(c);
        }

        if (request_mask & STATX_BTIME) {
                stat->result_mask |= STATX_BTIME;
                stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
        }

        if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;

        if (inode->ei_inode.bi_flags & BCH_INODE_append)
                stat->attributes |= STATX_ATTR_APPEND;
        stat->attributes_mask    |= STATX_ATTR_APPEND;

        if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
                stat->attributes |= STATX_ATTR_NODUMP;
        stat->attributes_mask    |= STATX_ATTR_NODUMP;

        return 0;
}

static int bch2_setattr(struct mnt_idmap *idmap,
                        struct dentry *dentry, struct iattr *iattr)
{
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret;

        lockdep_assert_held(&inode->v.i_rwsem);

        ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
                setattr_prepare(idmap, dentry, iattr);
        if (ret)
                return ret;

        return iattr->ia_valid & ATTR_SIZE
                ? bchfs_truncate(idmap, inode, iattr)
                : bch2_setattr_nonsize(idmap, inode, iattr);
}

static int bch2_tmpfile(struct mnt_idmap *idmap,
                        struct inode *vdir, struct file *file, umode_t mode)
{
        struct bch_inode_info *inode =
                __bch2_create(idmap, to_bch_ei(vdir),
                              file->f_path.dentry, mode, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

        if (IS_ERR(inode))
                return bch2_err_class(PTR_ERR(inode));

        d_mark_tmpfile(file, &inode->v);
        d_instantiate(file->f_path.dentry, &inode->v);
        return finish_open_simple(file, 0);
}

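/*
 * Report a single extent to the fiemap interface, translating key and pointer
 * state into FIEMAP_EXTENT_* flags; extents with data are reported once per
 * pointer.
 */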
static int bch2_fill_extent(struct bch_fs *c,
                            struct fiemap_extent_info *info,
                            struct bkey_s_c k, unsigned flags)
{
        if (bkey_extent_is_direct_data(k.k)) {
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                int ret;

                if (k.k->type == KEY_TYPE_reflink_v)
                        flags |= FIEMAP_EXTENT_SHARED;

                bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int flags2 = 0;
                        u64 offset = p.ptr.offset;

                        if (p.ptr.unwritten)
                                flags2 |= FIEMAP_EXTENT_UNWRITTEN;

                        if (p.crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
                                offset += p.crc.offset;

                        if ((offset & (block_sectors(c) - 1)) ||
                            (k.k->size & (block_sectors(c) - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

                        ret = fiemap_fill_next_extent(info,
                                                bkey_start_offset(k.k) << 9,
                                                offset << 9,
                                                k.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }

                return 0;
        } else if (bkey_extent_is_inline_data(k.k)) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
                                               0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
        } else {
                BUG();
        }
}

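/*
 * fiemap: walk the extents btree over the requested range, resolving reflink
 * pointers, and report each extent (the final one with FIEMAP_EXTENT_LAST).
 */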
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                       u64 start, u64 len)
{
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
        struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
        int ret = 0;

        ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
        if (ret)
                return ret;

        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        if (start + len < start)
                return -EINVAL;

        start >>= 9;

        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
        trans = bch2_trans_get(c);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             POS(ei->v.i_ino, start), 0);

        while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                enum btree_id data_btree = BTREE_ID_extents;

                bch2_trans_begin(trans);

                u32 snapshot;
                ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
                if (ret)
                        continue;

                bch2_btree_iter_set_snapshot(&iter, snapshot);

                k = bch2_btree_iter_peek_upto(&iter, end);
                ret = bkey_err(k);
                if (ret)
                        continue;

                if (!k.k)
                        break;

                if (!bkey_extent_is_data(k.k) &&
                    k.k->type != KEY_TYPE_reservation) {
                        bch2_btree_iter_advance(&iter);
                        continue;
                }

                offset_into_extent      = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;

                bch2_bkey_buf_reassemble(&cur, c, k);

                ret = bch2_read_indirect_extent(trans, &data_btree,
                                        &offset_into_extent, &cur);
                if (ret)
                        continue;

                k = bkey_i_to_s_c(cur.k);
                bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

                sectors = min(sectors, k.k->size - offset_into_extent);

                bch2_cut_front(POS(k.k->p.inode,
                                   bkey_start_offset(k.k) +
                                   offset_into_extent),
                               cur.k);
                bch2_key_resize(&cur.k->k, sectors);
                cur.k->k.p = iter.pos;
                cur.k->k.p.offset += cur.k->k.size;

                if (have_extent) {
                        bch2_trans_unlock(trans);
                        ret = bch2_fill_extent(c, info,
                                        bkey_i_to_s_c(prev.k), 0);
                        if (ret)
                                break;
                }

                bkey_copy(prev.k, cur.k);
                have_extent = true;

                bch2_btree_iter_set_pos(&iter,
                        POS(iter.pos.inode, iter.pos.offset + sectors));
        }
        bch2_trans_iter_exit(trans, &iter);

        if (!ret && have_extent) {
                bch2_trans_unlock(trans);
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
        }

        bch2_trans_put(trans);
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
}

static const struct vm_operations_struct bch_vm_ops = {
        .fault          = bch2_page_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = bch2_page_mkwrite,
};

static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);

        vma->vm_ops = &bch_vm_ops;
        return 0;
}

/* Directories: */

static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
        return generic_file_llseek_size(file, offset, whence,
                                        S64_MAX, S64_MAX);
}

static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;

        if (!dir_emit_dots(file, ctx))
                return 0;

        int ret = bch2_readdir(c, inode_inum(inode), ctx);

        bch_err_fn(c, ret);
        return bch2_err_class(ret);
}

static int bch2_open(struct inode *vinode, struct file *file)
{
        if (file->f_flags & (O_WRONLY|O_RDWR)) {
                struct bch_inode_info *inode = to_bch_ei(vinode);
                struct bch_fs *c = inode->v.i_sb->s_fs_info;

                int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
                if (ret)
                        return ret;
        }

        file->f_mode |= FMODE_CAN_ODIRECT;

        return generic_file_open(vinode, file);
}

static const struct file_operations bch_file_operations = {
        .open           = bch2_open,
        .llseek         = bch2_llseek,
        .read_iter      = bch2_read_iter,
        .write_iter     = bch2_write_iter,
        .mmap           = bch2_mmap,
        .get_unmapped_area = thp_get_unmapped_area,
        .fsync          = bch2_fsync,
        .splice_read    = filemap_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = bch2_fallocate_dispatch,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
        .remap_file_range = bch2_remap_file_range,
};

static const struct inode_operations bch_file_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .fiemap         = bch2_fiemap,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_inode_acl  = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_dir_inode_operations = {
        .lookup         = bch2_lookup,
        .create         = bch2_create,
        .link           = bch2_link,
        .unlink         = bch2_unlink,
        .symlink        = bch2_symlink,
        .mkdir          = bch2_mkdir,
        .rmdir          = bch2_unlink,
        .mknod          = bch2_mknod,
        .rename         = bch2_rename2,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .tmpfile        = bch2_tmpfile,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_inode_acl  = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct file_operations bch_dir_file_operations = {
        .llseek         = bch2_dir_llseek,
        .read           = generic_read_dir,
        .iterate_shared = bch2_vfs_readdir,
        .fsync          = bch2_fsync,
        .unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_symlink_inode_operations = {
        .get_link       = page_get_link,
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_inode_acl  = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct inode_operations bch_special_inode_operations = {
        .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        .get_inode_acl  = bch2_get_acl,
        .set_acl        = bch2_set_acl,
#endif
};

static const struct address_space_operations bch_address_space_operations = {
        .read_folio     = bch2_read_folio,
        .writepages     = bch2_writepages,
        .readahead      = bch2_readahead,
        .dirty_folio    = filemap_dirty_folio,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
        .invalidate_folio = bch2_invalidate_folio,
        .release_folio  = bch2_release_folio,
#ifdef CONFIG_MIGRATION
        .migrate_folio  = filemap_migrate_folio,
#endif
        .error_remove_folio = generic_error_remove_folio,
};

struct bcachefs_fid {
        u64             inum;
1497         u32             subvol;
1498         u32             gen;
1499 } __packed;
1500
1501 struct bcachefs_fid_with_parent {
1502         struct bcachefs_fid     fid;
1503         struct bcachefs_fid     dir;
1504 } __packed;
1505
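/*
 * Exportfs expresses file handle lengths in 32-bit words, hence the
 * sizeof() / sizeof(u32) checks below:
 */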
1506 static int bcachefs_fid_valid(int fh_len, int fh_type)
1507 {
1508         switch (fh_type) {
1509         case FILEID_BCACHEFS_WITHOUT_PARENT:
1510                 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1511         case FILEID_BCACHEFS_WITH_PARENT:
1512                 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1513         default:
1514                 return false;
1515         }
1516 }
1517
1518 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1519 {
1520         return (struct bcachefs_fid) {
1521                 .inum   = inode->ei_inum.inum,
1522                 .subvol = inode->ei_inum.subvol,
1523                 .gen    = inode->ei_inode.bi_generation,
1524         };
1525 }
1526
1527 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1528                           struct inode *vdir)
1529 {
1530         struct bch_inode_info *inode    = to_bch_ei(vinode);
1531         struct bch_inode_info *dir      = to_bch_ei(vdir);
1532         int min_len;
1533
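        /*
         * Per the exportfs convention, if the caller's buffer is too small we
         * report the required length (in u32s) and return FILEID_INVALID:
         */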
1534         if (!S_ISDIR(inode->v.i_mode) && dir) {
1535                 struct bcachefs_fid_with_parent *fid = (void *) fh;
1536
1537                 min_len = sizeof(*fid) / sizeof(u32);
1538                 if (*len < min_len) {
1539                         *len = min_len;
1540                         return FILEID_INVALID;
1541                 }
1542
1543                 fid->fid = bch2_inode_to_fid(inode);
1544                 fid->dir = bch2_inode_to_fid(dir);
1545
1546                 *len = min_len;
1547                 return FILEID_BCACHEFS_WITH_PARENT;
1548         } else {
1549                 struct bcachefs_fid *fid = (void *) fh;
1550
1551                 min_len = sizeof(*fid) / sizeof(u32);
1552                 if (*len < min_len) {
1553                         *len = min_len;
1554                         return FILEID_INVALID;
1555                 }
1556                 *fid = bch2_inode_to_fid(inode);
1557
1558                 *len = min_len;
1559                 return FILEID_BCACHEFS_WITHOUT_PARENT;
1560         }
1561 }
1562
1563 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1564                                         struct bcachefs_fid fid)
1565 {
1566         struct bch_fs *c = sb->s_fs_info;
1567         struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1568                                     .subvol = fid.subvol,
1569                                     .inum = fid.inum,
1570         });
1571         if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1572                 iput(vinode);
1573                 vinode = ERR_PTR(-ESTALE);
1574         }
1575         return vinode;
1576 }
1577
1578 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1579                 int fh_len, int fh_type)
1580 {
1581         struct bcachefs_fid *fid = (void *) _fid;
1582
1583         if (!bcachefs_fid_valid(fh_len, fh_type))
1584                 return NULL;
1585
1586         return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1587 }
1588
1589 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1590                 int fh_len, int fh_type)
1591 {
1592         struct bcachefs_fid_with_parent *fid = (void *) _fid;
1593
1594         if (!bcachefs_fid_valid(fh_len, fh_type) ||
1595             fh_type != FILEID_BCACHEFS_WITH_PARENT)
1596                 return NULL;
1597
1598         return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1599 }
1600
1601 static struct dentry *bch2_get_parent(struct dentry *child)
1602 {
1603         struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1604         struct bch_fs *c = inode->v.i_sb->s_fs_info;
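        /*
         * If bi_parent_subvol is set it names the parent directory's
         * subvolume; otherwise the parent is in the same subvolume as this
         * inode:
         */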
1605         subvol_inum parent_inum = {
1606                 .subvol = inode->ei_inode.bi_parent_subvol ?:
1607                         inode->ei_inum.subvol,
1608                 .inum = inode->ei_inode.bi_dir,
1609         };
1610
1611         return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1612 }
1613
1614 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1615 {
1616         struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
1617         struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
1618         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1619         struct btree_trans *trans;
1620         struct btree_iter iter1;
1621         struct btree_iter iter2;
1622         struct bkey_s_c k;
1623         struct bkey_s_c_dirent d;
1624         struct bch_inode_unpacked inode_u;
1625         subvol_inum target;
1626         u32 snapshot;
1627         struct qstr dirent_name;
1628         unsigned name_len = 0;
1629         int ret;
1630
1631         if (!S_ISDIR(dir->v.i_mode))
1632                 return -EINVAL;
1633
1634         trans = bch2_trans_get(c);
1635
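        /*
         * iter1 does the direct lookup via the inode's dirent backref
         * (bi_dir/bi_dir_offset); iter2 walks the whole directory as a
         * fallback for hardlinked files whose backref points elsewhere:
         */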
1636         bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1637                              POS(dir->ei_inode.bi_inum, 0), 0);
1638         bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1639                              POS(dir->ei_inode.bi_inum, 0), 0);
1640 retry:
1641         bch2_trans_begin(trans);
1642
1643         ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
1644         if (ret)
1645                 goto err;
1646
1647         bch2_btree_iter_set_snapshot(&iter1, snapshot);
1648         bch2_btree_iter_set_snapshot(&iter2, snapshot);
1649
1650         ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1651         if (ret)
1652                 goto err;
1653
1654         if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1655                 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1656
1657                 k = bch2_btree_iter_peek_slot(&iter1);
1658                 ret = bkey_err(k);
1659                 if (ret)
1660                         goto err;
1661
1662                 if (k.k->type != KEY_TYPE_dirent) {
1663                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1664                         goto err;
1665                 }
1666
1667                 d = bkey_s_c_to_dirent(k);
1668                 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1669                 if (ret > 0)
1670                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1671                 if (ret)
1672                         goto err;
1673
1674                 if (subvol_inum_eq(target, inode->ei_inum))
1675                         goto found;
1676         } else {
1677                 /*
1678                  * File with multiple hardlinks whose backref points at a
1679                  * different directory - fall back to a linear search:
1680                  */
1681                 for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1682                         if (k.k->p.inode > dir->ei_inode.bi_inum)
1683                                 break;
1684
1685                         if (k.k->type != KEY_TYPE_dirent)
1686                                 continue;
1687
1688                         d = bkey_s_c_to_dirent(k);
1689                         ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1690                         if (ret < 0)
1691                                 break;
1692                         if (ret)
1693                                 continue;
1694
1695                         if (subvol_inum_eq(target, inode->ei_inum))
1696                                 goto found;
1697                 }
1698         }
1699
1700         ret = -ENOENT;
1701         goto err;
1702 found:
1703         dirent_name = bch2_dirent_get_name(d);
1704
1705         name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1706         memcpy(name, dirent_name.name, name_len);
1707         name[name_len] = '\0';
1708 err:
1709         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1710                 goto retry;
1711
1712         bch2_trans_iter_exit(trans, &iter1);
1713         bch2_trans_iter_exit(trans, &iter2);
1714         bch2_trans_put(trans);
1715
1716         return ret;
1717 }
1718
1719 static const struct export_operations bch_export_ops = {
1720         .encode_fh      = bch2_encode_fh,
1721         .fh_to_dentry   = bch2_fh_to_dentry,
1722         .fh_to_parent   = bch2_fh_to_parent,
1723         .get_parent     = bch2_get_parent,
1724         .get_name       = bch2_get_name,
1725 };
1726
1727 static void bch2_vfs_inode_init(struct btree_trans *trans,
1728                                 subvol_inum inum,
1729                                 struct bch_inode_info *inode,
1730                                 struct bch_inode_unpacked *bi,
1731                                 struct bch_subvolume *subvol)
1732 {
1733         inode->v.i_ino          = inum.inum;
1734         inode->ei_inum          = inum;
1735         inode->ei_inode.bi_inum = inum.inum;
1736         bch2_inode_update_after_write(trans, inode, bi, ~0);
1737
1738         inode->v.i_blocks       = bi->bi_sectors;
1739         inode->v.i_ino          = bi->bi_inum;
1740         inode->v.i_rdev         = bi->bi_dev;
1741         inode->v.i_generation   = bi->bi_generation;
1742         inode->v.i_size         = bi->bi_size;
1743
1744         inode->ei_flags         = 0;
1745         inode->ei_quota_reserved = 0;
1746         inode->ei_qid           = bch_qid(bi);
1747
1748         if (BCH_SUBVOLUME_SNAP(subvol))
1749                 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1750
1751         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1752
1753         switch (inode->v.i_mode & S_IFMT) {
1754         case S_IFREG:
1755                 inode->v.i_op   = &bch_file_inode_operations;
1756                 inode->v.i_fop  = &bch_file_operations;
1757                 break;
1758         case S_IFDIR:
1759                 inode->v.i_op   = &bch_dir_inode_operations;
1760                 inode->v.i_fop  = &bch_dir_file_operations;
1761                 break;
1762         case S_IFLNK:
1763                 inode_nohighmem(&inode->v);
1764                 inode->v.i_op   = &bch_symlink_inode_operations;
1765                 break;
1766         default:
1767                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1768                 inode->v.i_op   = &bch_special_inode_operations;
1769                 break;
1770         }
1771
1772         mapping_set_large_folios(inode->v.i_mapping);
1773 }
1774
1775 static void bch2_free_inode(struct inode *vinode)
1776 {
1777         kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1778 }
1779
1780 static int inode_update_times_fn(struct btree_trans *trans,
1781                                  struct bch_inode_info *inode,
1782                                  struct bch_inode_unpacked *bi,
1783                                  void *p)
1784 {
1785         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1786
1787         bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1788         bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1789         bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1790
1791         return 0;
1792 }
1793
1794 static int bch2_vfs_write_inode(struct inode *vinode,
1795                                 struct writeback_control *wbc)
1796 {
1797         struct bch_fs *c = vinode->i_sb->s_fs_info;
1798         struct bch_inode_info *inode = to_bch_ei(vinode);
1799         int ret;
1800
1801         mutex_lock(&inode->ei_update_lock);
1802         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1803                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1804         mutex_unlock(&inode->ei_update_lock);
1805
1806         return bch2_err_class(ret);
1807 }
1808
1809 static void bch2_evict_inode(struct inode *vinode)
1810 {
1811         struct bch_fs *c = vinode->i_sb->s_fs_info;
1812         struct bch_inode_info *inode = to_bch_ei(vinode);
1813         bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
1814
1815         /*
1816          * evict() has waited for outstanding writeback and we'll do no more IO
1817          * through this inode, so it's safe to remove it from the VFS inode
1818          * hash table here.
1819          * Do that now so that other threads aren't blocked from pulling it
1820          * back in; there's no reason for them to be:
1821          */
1822         if (!delete)
1823                 bch2_inode_hash_remove(c, inode);
1824
1825         truncate_inode_pages_final(&inode->v.i_data);
1826
1827         clear_inode(&inode->v);
1828
1829         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1830
1831         if (delete) {
1832                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1833                                 KEY_TYPE_QUOTA_WARN);
1834                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1835                                 KEY_TYPE_QUOTA_WARN);
1836                 bch2_inode_rm(c, inode_inum(inode));
1837
1838                 /*
1839                 /*
1840                  * While deleting, the inode had to stay in the VFS hash table
1841                  * so that fsck can check whether unlinked inodes are still open:
1842                  */
1843         }
1844
1845         mutex_lock(&c->vfs_inodes_lock);
1846         list_del_init(&inode->ei_vfs_inode_list);
1847         mutex_unlock(&c->vfs_inodes_lock);
1848 }
1849
1850 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1851 {
1852         struct bch_inode_info *inode;
1853         DARRAY(struct bch_inode_info *) grabbed;
1854         bool clean_pass = false, this_pass_clean;
1855
1856         /*
1857          * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1858          * be pruned with d_mark_dontcache().
1859          *
1860          * Once we've had a clean pass where we didn't find any inodes without
1861          * I_DONTCACHE, we wait for them to be freed:
1862          */
1863
1864         darray_init(&grabbed);
1865         darray_make_room(&grabbed, 1024);
1866 again:
1867         cond_resched();
1868         this_pass_clean = true;
1869
1870         mutex_lock(&c->vfs_inodes_lock);
1871         list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1872                 if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
1873                         continue;
1874
1875                 if (!(inode->v.i_state & I_DONTCACHE) &&
1876                     !(inode->v.i_state & I_FREEING) &&
1877                     igrab(&inode->v)) {
1878                         this_pass_clean = false;
1879
1880                         if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1881                                 iput(&inode->v);
1882                                 break;
1883                         }
1884                 } else if (clean_pass && this_pass_clean) {
1885                         struct wait_bit_queue_entry wqe;
1886                         struct wait_queue_head *wq_head;
1887
1888                         wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
1889                         prepare_to_wait_event(wq_head, &wqe.wq_entry,
1890                                               TASK_UNINTERRUPTIBLE);
1891                         mutex_unlock(&c->vfs_inodes_lock);
1892
1893                         schedule();
1894                         finish_wait(wq_head, &wqe.wq_entry);
1895                         goto again;
1896                 }
1897         }
1898         mutex_unlock(&c->vfs_inodes_lock);
1899
1900         darray_for_each(grabbed, i) {
1901                 inode = *i;
1902                 d_mark_dontcache(&inode->v);
1903                 d_prune_aliases(&inode->v);
1904                 iput(&inode->v);
1905         }
1906         grabbed.nr = 0;
1907
1908         if (!clean_pass || !this_pass_clean) {
1909                 clean_pass = this_pass_clean;
1910                 goto again;
1911         }
1912
1913         darray_exit(&grabbed);
1914 }
1915
1916 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1917 {
1918         struct super_block *sb = dentry->d_sb;
1919         struct bch_fs *c = sb->s_fs_info;
1920         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
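        /*
         * usage counts from bch2_fs_usage_read_short() are in 512-byte
         * sectors; shift converts sectors to f_bsize-sized blocks:
         */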
1921         unsigned shift = sb->s_blocksize_bits - 9;
1922         /*
1923          * this assumes inodes take up 64 bytes, which is a decent average
1924          * number:
1925          */
1926         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1927
1928         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1929         buf->f_bsize    = sb->s_blocksize;
1930         buf->f_blocks   = usage.capacity >> shift;
1931         buf->f_bfree    = usage.free >> shift;
1932         buf->f_bavail   = avail_factor(usage.free) >> shift;
1933
1934         buf->f_files    = usage.nr_inodes + avail_inodes;
1935         buf->f_ffree    = avail_inodes;
1936
1937         buf->f_fsid     = uuid_to_fsid(c->sb.user_uuid.b);
1938         buf->f_namelen  = BCH_NAME_MAX;
1939
1940         return 0;
1941 }
1942
1943 static int bch2_sync_fs(struct super_block *sb, int wait)
1944 {
1945         struct bch_fs *c = sb->s_fs_info;
1946         int ret;
1947
1948         trace_bch2_sync_fs(sb, wait);
1949
1950         if (c->opts.journal_flush_disabled)
1951                 return 0;
1952
1953         if (!wait) {
1954                 bch2_journal_flush_async(&c->journal, NULL);
1955                 return 0;
1956         }
1957
1958         ret = bch2_journal_flush(&c->journal);
1959         return bch2_err_class(ret);
1960 }
1961
1962 static struct bch_fs *bch2_path_to_fs(const char *path)
1963 {
1964         struct bch_fs *c;
1965         dev_t dev;
1966         int ret;
1967
1968         ret = lookup_bdev(path, &dev);
1969         if (ret)
1970                 return ERR_PTR(ret);
1971
1972         c = bch2_dev_to_fs(dev);
1973         if (c)
1974                 closure_put(&c->cl);
1975         return c ?: ERR_PTR(-ENOENT);
1976 }
1977
1978 static int bch2_remount(struct super_block *sb, int *flags,
1979                         struct bch_opts opts)
1980 {
1981         struct bch_fs *c = sb->s_fs_info;
1982         int ret = 0;
1983
1984         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1985
1986         if (opts.read_only != c->opts.read_only) {
1987                 down_write(&c->state_lock);
1988
1989                 if (opts.read_only) {
1990                         bch2_fs_read_only(c);
1991
1992                         sb->s_flags |= SB_RDONLY;
1993                 } else {
1994                         ret = bch2_fs_read_write(c);
1995                         if (ret) {
1996                                 bch_err(c, "error going rw: %i", ret);
1997                                 up_write(&c->state_lock);
1998                                 ret = -EINVAL;
1999                                 goto err;
2000                         }
2001
2002                         sb->s_flags &= ~SB_RDONLY;
2003                 }
2004
2005                 c->opts.read_only = opts.read_only;
2006
2007                 up_write(&c->state_lock);
2008         }
2009
2010         if (opt_defined(opts, errors))
2011                 c->opts.errors = opts.errors;
2012 err:
2013         return bch2_err_class(ret);
2014 }
2015
2016 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
2017 {
2018         struct bch_fs *c = root->d_sb->s_fs_info;
2019         bool first = true;
2020
2021         for_each_online_member(c, ca) {
2022                 if (!first)
2023                         seq_putc(seq, ':');
2024                 first = false;
2025                 seq_puts(seq, ca->disk_sb.sb_name);
2026         }
2027
2028         return 0;
2029 }
2030
2031 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
2032 {
2033         struct bch_fs *c = root->d_sb->s_fs_info;
2034         struct printbuf buf = PRINTBUF;
2035
2036         bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
2037                           OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
2038         printbuf_nul_terminate(&buf);
2039         seq_printf(seq, ",%s", buf.buf);
2040
2041         int ret = buf.allocation_failure ? -ENOMEM : 0;
2042         printbuf_exit(&buf);
2043         return ret;
2044 }
2045
2046 static void bch2_put_super(struct super_block *sb)
2047 {
2048         struct bch_fs *c = sb->s_fs_info;
2049
2050         __bch2_fs_stop(c);
2051 }
2052
2053 /*
2054  * bcachefs doesn't currently integrate intwrite freeze protection, but the
2055  * internal write references serve the same purpose. Therefore reuse the
2056  * read-only transition code to perform the quiesce. The caveat is that we don't
2057  * currently have the ability to block tasks that want a write reference while
2058  * the superblock is frozen. This is fine for now, but we should either add
2059  * blocking support or find a way to integrate sb_start_intwrite() and friends.
2060  */
2061 static int bch2_freeze(struct super_block *sb)
2062 {
2063         struct bch_fs *c = sb->s_fs_info;
2064
2065         down_write(&c->state_lock);
2066         bch2_fs_read_only(c);
2067         up_write(&c->state_lock);
2068         return 0;
2069 }
2070
2071 static int bch2_unfreeze(struct super_block *sb)
2072 {
2073         struct bch_fs *c = sb->s_fs_info;
2074         int ret;
2075
2076         if (test_bit(BCH_FS_emergency_ro, &c->flags))
2077                 return 0;
2078
2079         down_write(&c->state_lock);
2080         ret = bch2_fs_read_write(c);
2081         up_write(&c->state_lock);
2082         return ret;
2083 }
2084
2085 static const struct super_operations bch_super_operations = {
2086         .alloc_inode    = bch2_alloc_inode,
2087         .free_inode     = bch2_free_inode,
2088         .write_inode    = bch2_vfs_write_inode,
2089         .evict_inode    = bch2_evict_inode,
2090         .sync_fs        = bch2_sync_fs,
2091         .statfs         = bch2_statfs,
2092         .show_devname   = bch2_show_devname,
2093         .show_options   = bch2_show_options,
2094         .put_super      = bch2_put_super,
2095         .freeze_fs      = bch2_freeze,
2096         .unfreeze_fs    = bch2_unfreeze,
2097 };
2098
2099 static int bch2_set_super(struct super_block *s, void *data)
2100 {
2101         s->s_fs_info = data;
2102         return 0;
2103 }
2104
2105 static int bch2_noset_super(struct super_block *s, void *data)
2106 {
2107         return -EBUSY;
2108 }
2109
2110 typedef DARRAY(struct bch_fs *) darray_fs;
2111
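/*
 * A candidate superblock matches only if every device the caller passed in
 * maps to the same bch_fs that backs it:
 */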
2112 static int bch2_test_super(struct super_block *s, void *data)
2113 {
2114         struct bch_fs *c = s->s_fs_info;
2115         darray_fs *d = data;
2116
2117         if (!c)
2118                 return false;
2119
2120         darray_for_each(*d, i)
2121                 if (c != *i)
2122                         return false;
2123         return true;
2124 }
2125
2126 static int bch2_fs_get_tree(struct fs_context *fc)
2127 {
2128         struct bch_fs *c;
2129         struct super_block *sb;
2130         struct inode *vinode;
2131         struct bch2_opts_parse *opts_parse = fc->fs_private;
2132         struct bch_opts opts = opts_parse->opts;
2133         darray_str devs;
2134         darray_fs devs_to_fs = {};
2135         int ret;
2136
2137         opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
2138         opt_set(opts, nostart, true);
2139
2140         if (!fc->source || strlen(fc->source) == 0)
2141                 return -EINVAL;
2142
2143         ret = bch2_split_devs(fc->source, &devs);
2144         if (ret)
2145                 return ret;
2146
2147         darray_for_each(devs, i) {
2148                 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
2149                 if (ret)
2150                         goto err;
2151         }
2152
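        /*
         * First look for an existing superblock already backed by these
         * devices; bch2_noset_super() ensures sget() won't create one here:
         */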
2153         sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
2154         if (!IS_ERR(sb))
2155                 goto got_sb;
2156
2157         c = bch2_fs_open(devs.data, devs.nr, opts);
2158         ret = PTR_ERR_OR_ZERO(c);
2159         if (ret)
2160                 goto err;
2161
2162         /* Some options can't be parsed until after the fs is started: */
2163         opts = bch2_opts_empty();
2164         ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
2165         if (ret)
2166                 goto err_stop_fs;
2167
2168         bch2_opts_apply(&c->opts, opts);
2169
2170         ret = bch2_fs_start(c);
2171         if (ret)
2172                 goto err_stop_fs;
2173
2174         sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
2175         ret = PTR_ERR_OR_ZERO(sb);
2176         if (ret)
2177                 goto err_stop_fs;
2178 got_sb:
2179         c = sb->s_fs_info;
2180
2181         if (sb->s_root) {
2182                 if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
2183                         ret = -EBUSY;
2184                         goto err_put_super;
2185                 }
2186                 goto out;
2187         }
2188
2189         sb->s_blocksize         = block_bytes(c);
2190         sb->s_blocksize_bits    = ilog2(block_bytes(c));
2191         sb->s_maxbytes          = MAX_LFS_FILESIZE;
2192         sb->s_op                = &bch_super_operations;
2193         sb->s_export_op         = &bch_export_ops;
2194 #ifdef CONFIG_BCACHEFS_QUOTA
2195         sb->s_qcop              = &bch2_quotactl_operations;
2196         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
2197 #endif
2198         sb->s_xattr             = bch2_xattr_handlers;
2199         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
2200         sb->s_time_gran         = c->sb.nsec_per_time_unit;
2201         sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
2202         sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
2203         sb->s_uuid              = c->sb.user_uuid;
2204         sb->s_shrink->seeks     = 0;
2205         c->vfs_sb               = sb;
2206         strscpy(sb->s_id, c->name, sizeof(sb->s_id));
2207
2208         ret = super_setup_bdi(sb);
2209         if (ret)
2210                 goto err_put_super;
2211
2212         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
2213
2214         for_each_online_member(c, ca) {
2215                 struct block_device *bdev = ca->disk_sb.bdev;
2216
2217                 /* XXX: create an anonymous device for multi device filesystems */
2218                 sb->s_bdev      = bdev;
2219                 sb->s_dev       = bdev->bd_dev;
2220                 percpu_ref_put(&ca->io_ref);
2221                 break;
2222         }
2223
2224         c->dev = sb->s_dev;
2225
2226 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2227         if (c->opts.acl)
2228                 sb->s_flags     |= SB_POSIXACL;
2229 #endif
2230
2231         sb->s_shrink->seeks = 0;
2232
2233         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2234         ret = PTR_ERR_OR_ZERO(vinode);
2235         bch_err_msg(c, ret, "mounting: error getting root inode");
2236         if (ret)
2237                 goto err_put_super;
2238
2239         sb->s_root = d_make_root(vinode);
2240         if (!sb->s_root) {
2241                 bch_err(c, "error mounting: error allocating root dentry");
2242                 ret = -ENOMEM;
2243                 goto err_put_super;
2244         }
2245
2246         sb->s_flags |= SB_ACTIVE;
2247 out:
2248         fc->root = dget(sb->s_root);
2249 err:
2250         darray_exit(&devs_to_fs);
2251         bch2_darray_str_exit(&devs);
2252         if (ret)
2253                 pr_err("error: %s", bch2_err_str(ret));
2254         /*
2255          * On an inconsistency error in recovery we might see an -EROFS-derived
2256          * error code (from the journal), but we don't want to return that to
2257          * userspace as that causes util-linux to retry the mount RO - which is
2258          * confusing:
2259          */
2260         if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2261                 ret = -EIO;
2262         return bch2_err_class(ret);
2263
2264 err_stop_fs:
2265         bch2_fs_stop(c);
2266         goto err;
2267
2268 err_put_super:
2269         __bch2_fs_stop(c);
2270         deactivate_locked_super(sb);
2271         goto err;
2272 }
2273
2274 static void bch2_kill_sb(struct super_block *sb)
2275 {
2276         struct bch_fs *c = sb->s_fs_info;
2277
2278         generic_shutdown_super(sb);
2279         bch2_fs_free(c);
2280 }
2281
2282 static void bch2_fs_context_free(struct fs_context *fc)
2283 {
2284         struct bch2_opts_parse *opts = fc->fs_private;
2285
2286         if (opts) {
2287                 printbuf_exit(&opts->parse_later);
2288                 kfree(opts);
2289         }
2290 }
2291
2292 static int bch2_fs_parse_param(struct fs_context *fc,
2293                                struct fs_parameter *param)
2294 {
2295         /*
2296          * the "source" param, i.e., the name of the device(s) to mount,
2297          * is handled by the VFS layer.
2298          */
2299         if (!strcmp(param->key, "source"))
2300                 return -ENOPARAM;
2301
2302         struct bch2_opts_parse *opts = fc->fs_private;
2303         struct bch_fs *c = NULL;
2304
2305         /* for reconfigure, we already have a struct bch_fs */
2306         if (fc->root)
2307                 c = fc->root->d_sb->s_fs_info;
2308
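        /*
         * Options that can't be applied yet are accumulated in parse_later and
         * re-parsed in bch2_fs_get_tree() once the filesystem is open:
         */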
2309         int ret = bch2_parse_one_mount_opt(c, &opts->opts,
2310                                            &opts->parse_later, param->key,
2311                                            param->string);
2312
2313         return bch2_err_class(ret);
2314 }
2315
2316 static int bch2_fs_reconfigure(struct fs_context *fc)
2317 {
2318         struct super_block *sb = fc->root->d_sb;
2319         struct bch2_opts_parse *opts = fc->fs_private;
2320
2321         return bch2_remount(sb, &fc->sb_flags, opts->opts);
2322 }
2323
2324 static const struct fs_context_operations bch2_context_ops = {
2325         .free        = bch2_fs_context_free,
2326         .parse_param = bch2_fs_parse_param,
2327         .get_tree    = bch2_fs_get_tree,
2328         .reconfigure = bch2_fs_reconfigure,
2329 };
2330
2331 static int bch2_init_fs_context(struct fs_context *fc)
2332 {
2333         struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
2334
2335         if (!opts)
2336                 return -ENOMEM;
2337
2338         opts->parse_later = PRINTBUF;
2339
2340         fc->ops = &bch2_context_ops;
2341         fc->fs_private = opts;
2342
2343         return 0;
2344 }
2345
2346 void bch2_fs_vfs_exit(struct bch_fs *c)
2347 {
2348         if (c->vfs_inodes_table.tbl)
2349                 rhashtable_destroy(&c->vfs_inodes_table);
2350 }
2351
2352 int bch2_fs_vfs_init(struct bch_fs *c)
2353 {
2354         return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
2355 }
2356
2357 static struct file_system_type bcache_fs_type = {
2358         .owner                  = THIS_MODULE,
2359         .name                   = "bcachefs",
2360         .init_fs_context        = bch2_init_fs_context,
2361         .kill_sb                = bch2_kill_sb,
2362         .fs_flags               = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
2363 };
2364
2365 MODULE_ALIAS_FS("bcachefs");
2366
2367 void bch2_vfs_exit(void)
2368 {
2369         unregister_filesystem(&bcache_fs_type);
2370         kmem_cache_destroy(bch2_inode_cache);
2371 }
2372
2373 int __init bch2_vfs_init(void)
2374 {
2375         int ret = -ENOMEM;
2376
2377         bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
2378                                       SLAB_ACCOUNT);
2379         if (!bch2_inode_cache)
2380                 goto err;
2381
2382         ret = register_filesystem(&bcache_fs_type);
2383         if (ret)
2384                 goto err;
2385
2386         return 0;
2387 err:
2388         bch2_vfs_exit();
2389         return ret;
2390 }
2391
2392 #endif /* NO_BCACHEFS_FS */