1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "rebalance.h"
27 #include "snapshot.h"
28 #include "super.h"
29 #include "xattr.h"
30 #include "trace.h"
31
32 #include <linux/aio.h>
33 #include <linux/backing-dev.h>
34 #include <linux/exportfs.h>
35 #include <linux/fiemap.h>
36 #include <linux/fs_context.h>
37 #include <linux/module.h>
38 #include <linux/pagemap.h>
39 #include <linux/posix_acl.h>
40 #include <linux/random.h>
41 #include <linux/seq_file.h>
42 #include <linux/siphash.h>
43 #include <linux/statfs.h>
44 #include <linux/string.h>
45 #include <linux/xattr.h>
46
47 static struct kmem_cache *bch2_inode_cache;
48
49 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
50                                 struct bch_inode_info *,
51                                 struct bch_inode_unpacked *,
52                                 struct bch_subvolume *);
53
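/*
 * Copy an unpacked btree inode back into the VFS inode after a successful
 * update: ownership, mode and link count are always synced, while size and
 * timestamps are only copied for the attributes named in @fields.
 */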
54 void bch2_inode_update_after_write(struct btree_trans *trans,
55                                    struct bch_inode_info *inode,
56                                    struct bch_inode_unpacked *bi,
57                                    unsigned fields)
58 {
59         struct bch_fs *c = trans->c;
60
61         BUG_ON(bi->bi_inum != inode->v.i_ino);
62
63         bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
64
65         set_nlink(&inode->v, bch2_inode_nlink_get(bi));
66         i_uid_write(&inode->v, bi->bi_uid);
67         i_gid_write(&inode->v, bi->bi_gid);
68         inode->v.i_mode = bi->bi_mode;
69
70         if (fields & ATTR_SIZE)
71                 i_size_write(&inode->v, bi->bi_size);
72
73         if (fields & ATTR_ATIME)
74                 inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
75         if (fields & ATTR_MTIME)
76                 inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
77         if (fields & ATTR_CTIME)
78                 inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
79
80         inode->ei_inode         = *bi;
81
82         bch2_inode_flags_to_vfs(inode);
83 }
84
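/*
 * Run @set against the inode under a btree transaction, write the result
 * back to the inodes btree and commit, retrying on transaction restart.
 * If the update changed the inode's rebalance options, a rebalance scan
 * is also requested.
 */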
85 int __must_check bch2_write_inode(struct bch_fs *c,
86                                   struct bch_inode_info *inode,
87                                   inode_set_fn set,
88                                   void *p, unsigned fields)
89 {
90         struct btree_trans *trans = bch2_trans_get(c);
91         struct btree_iter iter = { NULL };
92         struct bch_inode_unpacked inode_u;
93         int ret;
94 retry:
95         bch2_trans_begin(trans);
96
97         ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
98         if (ret)
99                 goto err;
100
101         struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
102
103         ret = (set ? set(trans, inode, &inode_u, p) : 0);
104         if (ret)
105                 goto err;
106
107         struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
108
109         if (memcmp(&old_r, &new_r, sizeof(new_r))) {
110                 ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
111                 if (ret)
112                         goto err;
113         }
114
115         ret   = bch2_inode_write(trans, &iter, &inode_u) ?:
116                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
117
118         /*
119          * the btree node lock protects inode->ei_inode, not ei_update_lock;
120          * this is important for inode updates via bchfs_write_index_update
121          */
122         if (!ret)
123                 bch2_inode_update_after_write(trans, inode, &inode_u, fields);
124 err:
125         bch2_trans_iter_exit(trans, &iter);
126
127         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
128                 goto retry;
129
130         bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
131                              "%s: inode %llu:%llu not found when updating",
132                              bch2_err_str(ret),
133                              inode_inum(inode).subvol,
134                              inode_inum(inode).inum);
135
136         bch2_trans_put(trans);
137         return ret < 0 ? ret : 0;
138 }
139
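/*
 * Transfer this inode's quota usage (i_blocks plus reserved space) from its
 * current qids to @new_qid for the quota types in @qtypes, updating ei_qid
 * on success.
 */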
140 int bch2_fs_quota_transfer(struct bch_fs *c,
141                            struct bch_inode_info *inode,
142                            struct bch_qid new_qid,
143                            unsigned qtypes,
144                            enum quota_acct_mode mode)
145 {
146         unsigned i;
147         int ret;
148
149         qtypes &= enabled_qtypes(c);
150
151         for (i = 0; i < QTYP_NR; i++)
152                 if (new_qid.q[i] == inode->ei_qid.q[i])
153                         qtypes &= ~(1U << i);
154
155         if (!qtypes)
156                 return 0;
157
158         mutex_lock(&inode->ei_quota_lock);
159
160         ret = bch2_quota_transfer(c, qtypes, new_qid,
161                                   inode->ei_qid,
162                                   inode->v.i_blocks +
163                                   inode->ei_quota_reserved,
164                                   mode);
165         if (!ret)
166                 for (i = 0; i < QTYP_NR; i++)
167                         if (qtypes & (1 << i))
168                                 inode->ei_qid.q[i] = new_qid.q[i];
169
170         mutex_unlock(&inode->ei_quota_lock);
171
172         return ret;
173 }
174
175 static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
176 {
177         return a.subvol == b.subvol && a.inum == b.inum;
178 }
179
180 static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
181 {
182         const subvol_inum *inum = data;
183         siphash_key_t k = { .key[0] = seed };
184
185         return siphash_2u64(inum->subvol, inum->inum, &k);
186 }
187
188 static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
189 {
190         const struct bch_inode_info *inode = data;
191
192         return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
193 }
194
195 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
196                                  const void *obj)
197 {
198         const struct bch_inode_info *inode = obj;
199         const subvol_inum *v = arg->key;
200
201         return !subvol_inum_eq(inode->ei_inum, *v);
202 }
203
204 static const struct rhashtable_params bch2_vfs_inodes_params = {
205         .head_offset            = offsetof(struct bch_inode_info, hash),
206         .key_offset             = offsetof(struct bch_inode_info, ei_inum),
207         .key_len                = sizeof(subvol_inum),
208         .hashfn                 = bch2_vfs_inode_hash_fn,
209         .obj_hashfn             = bch2_vfs_inode_obj_hash_fn,
210         .obj_cmpfn              = bch2_vfs_inode_cmp_fn,
211         .automatic_shrinking    = true,
212 };
213
214 static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
215         .head_offset            = offsetof(struct bch_inode_info, by_inum_hash),
216         .key_offset             = offsetof(struct bch_inode_info, ei_inum.inum),
217         .key_len                = sizeof(u64),
218         .automatic_shrinking    = true,
219 };
220
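/*
 * Returns true if the inode number at @p has a VFS inode open in @p's
 * snapshot or in any descendant snapshot.
 */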
221 int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
222 {
223         struct bch_fs *c = trans->c;
224         struct rhltable *ht = &c->vfs_inodes_by_inum_table;
225         u64 inum = p.offset;
226         DARRAY(u32) subvols;
227         int ret = 0;
228
229         if (!test_bit(BCH_FS_started, &c->flags))
230                 return false;
231
232         darray_init(&subvols);
233 restart_from_top:
234
235         /*
236          * Tweaked version of __rhashtable_lookup(); we need to get a list of
237          * subvolumes in which the given inode number is open.
238          *
239          * For this to work, we don't include the subvolume ID in the key that
240          * we hash - all inodes with the same inode number regardless of
241          * subvolume will hash to the same slot.
242          *
243          * This will be less than ideal if the same file is ever open
244          * simultaneously in many different snapshots:
245          */
246         rcu_read_lock();
247         struct rhash_lock_head __rcu *const *bkt;
248         struct rhash_head *he;
249         unsigned int hash;
250         struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
251 restart:
252         hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
253         bkt = rht_bucket(tbl, hash);
254         do {
255                 struct bch_inode_info *inode;
256
257                 rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
258                         if (inode->ei_inum.inum == inum) {
259                                 ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
260                                                       GFP_NOWAIT|__GFP_NOWARN);
261                                 if (ret) {
262                                         rcu_read_unlock();
263                                         ret = darray_make_room(&subvols, 1);
264                                         if (ret)
265                                                 goto err;
266                                         subvols.nr = 0;
267                                         goto restart_from_top;
268                                 }
269                         }
270                 }
271                 /* An object might have been moved to a different hash chain,
272                  * while we walk along it - better check and retry.
273                  */
274         } while (he != RHT_NULLS_MARKER(bkt));
275
276         /* Ensure we see any new tables. */
277         smp_rmb();
278
279         tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
280         if (unlikely(tbl))
281                 goto restart;
282         rcu_read_unlock();
283
284         darray_for_each(subvols, i) {
285                 u32 snap;
286                 ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
287                 if (ret)
288                         goto err;
289
290                 ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
291                 if (ret)
292                         break;
293         }
294 err:
295         darray_exit(&subvols);
296         return ret;
297 }
298
299 static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
300 {
301         return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
302 }
303
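/*
 * An inode with the same subvol_inum is being evicted; wait (on __I_NEW)
 * for it to drop out of the hash table before retrying the lookup.
 */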
304 static void __wait_on_freeing_inode(struct bch_fs *c,
305                                     struct bch_inode_info *inode,
306                                     subvol_inum inum)
307 {
308         wait_queue_head_t *wq;
309         struct wait_bit_queue_entry wait;
310
311         wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
312         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
313         spin_unlock(&inode->v.i_lock);
314
315         if (__bch2_inode_hash_find(c, inum) == inode)
316                 schedule_timeout(HZ * 10);
317         finish_wait(wq, &wait.wq_entry);
318 }
319
320 static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
321                                                    subvol_inum inum)
322 {
323         struct bch_inode_info *inode;
324 repeat:
325         inode = __bch2_inode_hash_find(c, inum);
326         if (inode) {
327                 spin_lock(&inode->v.i_lock);
328                 if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
329                         spin_unlock(&inode->v.i_lock);
330                         return NULL;
331                 }
332                 if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
333                         if (!trans) {
334                                 __wait_on_freeing_inode(c, inode, inum);
335                         } else {
336                                 bch2_trans_unlock(trans);
337                                 __wait_on_freeing_inode(c, inode, inum);
338                                 int ret = bch2_trans_relock(trans);
339                                 if (ret)
340                                         return ERR_PTR(ret);
341                         }
342                         goto repeat;
343                 }
344                 __iget(&inode->v);
345                 spin_unlock(&inode->v.i_lock);
346         }
347
348         return inode;
349 }
350
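/*
 * Remove the inode from both hash tables (keyed by subvol_inum and by bare
 * inode number) and wake anyone waiting in __wait_on_freeing_inode().
 */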
351 static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
352 {
353         spin_lock(&inode->v.i_lock);
354         bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
355         spin_unlock(&inode->v.i_lock);
356
357         if (remove) {
358                 int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
359                                         &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
360                 BUG_ON(ret);
361
362                 ret = rhashtable_remove_fast(&c->vfs_inodes_table,
363                                         &inode->hash, bch2_vfs_inodes_params);
364                 BUG_ON(ret);
365                 inode->v.i_hash.pprev = NULL;
366                 /*
367                  * This pairs with the bch2_inode_hash_find() ->
368                  * __wait_on_freeing_inode() path
369                  */
370                 inode_wake_up_bit(&inode->v, __I_NEW);
371         }
372 }
373
374 static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
375                                                      struct btree_trans *trans,
376                                                      struct bch_inode_info *inode)
377 {
378         struct bch_inode_info *old = inode;
379
380         set_bit(EI_INODE_HASHED, &inode->ei_flags);
381 retry:
382         if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
383                                         &inode->ei_inum,
384                                         &inode->hash,
385                                         bch2_vfs_inodes_params))) {
386                 old = bch2_inode_hash_find(c, trans, inode->ei_inum);
387                 if (!old)
388                         goto retry;
389
390                 clear_bit(EI_INODE_HASHED, &inode->ei_flags);
391
392                 /*
393                  * bcachefs doesn't use I_NEW; we have no use for it since we
394                  * only insert fully created inodes in the inode hash table. But
395                  * discard_new_inode() expects it to be set...
396                  */
397                 inode->v.i_state |= I_NEW;
398                 /*
399                  * We don't want bch2_evict_inode() to delete the inode on disk,
400                  * we just raced and had another inode in cache. Normally new
401                  * inodes don't have nlink == 0 - except tmpfiles do...
402                  */
403                 set_nlink(&inode->v, 1);
404                 discard_new_inode(&inode->v);
405                 return old;
406         } else {
407                 int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
408                                           &inode->by_inum_hash,
409                                           bch2_vfs_inodes_by_inum_params);
410                 BUG_ON(ret);
411
412                 inode_fake_hash(&inode->v);
413
414                 inode_sb_list_add(&inode->v);
415
416                 mutex_lock(&c->vfs_inodes_lock);
417                 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
418                 mutex_unlock(&c->vfs_inodes_lock);
419                 return inode;
420         }
421 }
422
423 #define memalloc_flags_do(_flags, _do)                                          \
424 ({                                                                              \
425         unsigned _saved_flags = memalloc_flags_save(_flags);                    \
426         typeof(_do) _ret = _do;                                                 \
427         memalloc_noreclaim_restore(_saved_flags);                               \
428         _ret;                                                                   \
429 })
430
431 static struct inode *bch2_alloc_inode(struct super_block *sb)
432 {
433         BUG();
434 }
435
436 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
437 {
438         struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
439                                                 bch2_inode_cache, gfp);
440         if (!inode)
441                 return NULL;
442
443         inode_init_once(&inode->v);
444         mutex_init(&inode->ei_update_lock);
445         two_state_lock_init(&inode->ei_pagecache_lock);
446         INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
447         inode->ei_flags = 0;
448         mutex_init(&inode->ei_quota_lock);
449         memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
450
451         if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
452                 kmem_cache_free(bch2_inode_cache, inode);
453                 return NULL;
454         }
455
456         return inode;
457 }
458
459 /*
460  * Allocate a new inode, dropping/retaking btree locks if necessary:
461  */
462 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
463 {
464         struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
465
466         if (unlikely(!inode)) {
467                 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
468                 if (ret && inode) {
469                         __destroy_inode(&inode->v);
470                         kmem_cache_free(bch2_inode_cache, inode);
471                 }
472                 if (ret)
473                         return ERR_PTR(ret);
474         }
475
476         return inode;
477 }
478
479 static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
480                                                           subvol_inum inum,
481                                                           struct bch_inode_unpacked *bi,
482                                                           struct bch_subvolume *subvol)
483 {
484         struct bch_inode_info *inode = bch2_new_inode(trans);
485         if (IS_ERR(inode))
486                 return inode;
487
488         bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
489
490         return bch2_inode_hash_insert(trans->c, trans, inode);
491
492 }
493
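/*
 * Look up the VFS inode for @inum, reading it from the btree and adding it
 * to the inode cache if it isn't already present.
 */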
494 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
495 {
496         struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
497         if (inode)
498                 return &inode->v;
499
500         struct btree_trans *trans = bch2_trans_get(c);
501
502         struct bch_inode_unpacked inode_u;
503         struct bch_subvolume subvol;
504         int ret = lockrestart_do(trans,
505                 bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
506                 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
507                 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
508         bch2_trans_put(trans);
509
510         return ret ? ERR_PTR(ret) : &inode->v;
511 }
512
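/*
 * Common creation path for mknod/create/mkdir, symlink and tmpfile:
 * preallocates the VFS inode and ACLs, creates the new inode in a btree
 * transaction, then hashes the fully initialized inode.
 */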
513 struct bch_inode_info *
514 __bch2_create(struct mnt_idmap *idmap,
515               struct bch_inode_info *dir, struct dentry *dentry,
516               umode_t mode, dev_t rdev, subvol_inum snapshot_src,
517               unsigned flags)
518 {
519         struct bch_fs *c = dir->v.i_sb->s_fs_info;
520         struct btree_trans *trans;
521         struct bch_inode_unpacked dir_u;
522         struct bch_inode_info *inode;
523         struct bch_inode_unpacked inode_u;
524         struct posix_acl *default_acl = NULL, *acl = NULL;
525         subvol_inum inum;
526         struct bch_subvolume subvol;
527         u64 journal_seq = 0;
528         kuid_t kuid;
529         kgid_t kgid;
530         int ret;
531
532         /*
533          * preallocate acls + vfs inode before btree transaction, so that
534          * nothing can fail after the transaction succeeds:
535          */
536 #ifdef CONFIG_BCACHEFS_POSIX_ACL
537         ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
538         if (ret)
539                 return ERR_PTR(ret);
540 #endif
541         inode = __bch2_new_inode(c, GFP_NOFS);
542         if (unlikely(!inode)) {
543                 inode = ERR_PTR(-ENOMEM);
544                 goto err;
545         }
546
547         bch2_inode_init_early(c, &inode_u);
548
549         if (!(flags & BCH_CREATE_TMPFILE))
550                 mutex_lock(&dir->ei_update_lock);
551
552         trans = bch2_trans_get(c);
553 retry:
554         bch2_trans_begin(trans);
555
556         kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
557         kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
558         ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
559                 bch2_create_trans(trans,
560                                   inode_inum(dir), &dir_u, &inode_u,
561                                   !(flags & BCH_CREATE_TMPFILE)
562                                   ? &dentry->d_name : NULL,
563                                   from_kuid(i_user_ns(&dir->v), kuid),
564                                   from_kgid(i_user_ns(&dir->v), kgid),
565                                   mode, rdev,
566                                   default_acl, acl, snapshot_src, flags) ?:
567                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
568                                 KEY_TYPE_QUOTA_PREALLOC);
569         if (unlikely(ret))
570                 goto err_before_quota;
571
572         inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
573         inum.inum = inode_u.bi_inum;
574
575         ret   = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
576                 bch2_trans_commit(trans, NULL, &journal_seq, 0);
577         if (unlikely(ret)) {
578                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
579                                 KEY_TYPE_QUOTA_WARN);
580 err_before_quota:
581                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
582                         goto retry;
583                 goto err_trans;
584         }
585
586         if (!(flags & BCH_CREATE_TMPFILE)) {
587                 bch2_inode_update_after_write(trans, dir, &dir_u,
588                                               ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
589                 mutex_unlock(&dir->ei_update_lock);
590         }
591
592         bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
593
594         set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
595         set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
596
597         /*
598          * we must insert the new inode into the inode cache before calling
599          * bch2_trans_put() and dropping locks, else we could race with another
600          * thread pulling the inode in and modifying it:
601          *
602          * also, calling bch2_inode_hash_insert() without passing in the
603          * transaction object is sketchy - if we could ever end up in
604          * __wait_on_freeing_inode(), we'd risk deadlock.
605          *
606          * But that shouldn't be possible, since we still have the inode locked
607          * that we just created, and we _really_ can't take a transaction
608          * restart here.
609          */
610         inode = bch2_inode_hash_insert(c, NULL, inode);
611         bch2_trans_put(trans);
612 err:
613         posix_acl_release(default_acl);
614         posix_acl_release(acl);
615         return inode;
616 err_trans:
617         if (!(flags & BCH_CREATE_TMPFILE))
618                 mutex_unlock(&dir->ei_update_lock);
619
620         bch2_trans_put(trans);
621         make_bad_inode(&inode->v);
622         iput(&inode->v);
623         inode = ERR_PTR(ret);
624         goto err;
625 }
626
627 /* methods */
628
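/*
 * Resolve @name in directory @dir: look up the dirent, read its target, and
 * return the corresponding VFS inode, pulling it into the inode cache if
 * necessary.
 */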
629 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
630                         subvol_inum dir, struct bch_hash_info *dir_hash_info,
631                         const struct qstr *name)
632 {
633         struct bch_fs *c = trans->c;
634         struct btree_iter dirent_iter = {};
635         subvol_inum inum = {};
636         struct printbuf buf = PRINTBUF;
637
638         struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
639                                              dir_hash_info, dir, name, 0);
640         int ret = bkey_err(k);
641         if (ret)
642                 return ERR_PTR(ret);
643
644         ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
645         if (ret > 0)
646                 ret = -ENOENT;
647         if (ret)
648                 goto err;
649
650         struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
651         if (inode)
652                 goto out;
653
654         struct bch_subvolume subvol;
655         struct bch_inode_unpacked inode_u;
656         ret =   bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
657                 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
658                 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
659
660         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
661                                 c, "dirent to missing inode:\n  %s",
662                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
663         if (ret)
664                 goto err;
665
666         /* regular files may have hardlinks: */
667         if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
668                                     !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
669                                     c,
670                                     "dirent points to inode that does not point back:\n  %s",
671                                     (bch2_bkey_val_to_text(&buf, c, k),
672                                      prt_printf(&buf, "\n  "),
673                                      bch2_inode_unpacked_to_text(&buf, &inode_u),
674                                      buf.buf))) {
675                 ret = -ENOENT;
676                 goto err;
677         }
678 out:
679         bch2_trans_iter_exit(trans, &dirent_iter);
680         printbuf_exit(&buf);
681         return inode;
682 err:
683         inode = ERR_PTR(ret);
684         goto out;
685 }
686
687 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
688                                   unsigned int flags)
689 {
690         struct bch_fs *c = vdir->i_sb->s_fs_info;
691         struct bch_inode_info *dir = to_bch_ei(vdir);
692         struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
693
694         struct bch_inode_info *inode;
695         bch2_trans_do(c,
696                 PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
697                                                           &hash, &dentry->d_name)));
698         if (IS_ERR(inode))
699                 inode = NULL;
700
701         return d_splice_alias(&inode->v, dentry);
702 }
703
704 static int bch2_mknod(struct mnt_idmap *idmap,
705                       struct inode *vdir, struct dentry *dentry,
706                       umode_t mode, dev_t rdev)
707 {
708         struct bch_inode_info *inode =
709                 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
710                               (subvol_inum) { 0 }, 0);
711
712         if (IS_ERR(inode))
713                 return bch2_err_class(PTR_ERR(inode));
714
715         d_instantiate(dentry, &inode->v);
716         return 0;
717 }
718
719 static int bch2_create(struct mnt_idmap *idmap,
720                        struct inode *vdir, struct dentry *dentry,
721                        umode_t mode, bool excl)
722 {
723         return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
724 }
725
726 static int __bch2_link(struct bch_fs *c,
727                        struct bch_inode_info *inode,
728                        struct bch_inode_info *dir,
729                        struct dentry *dentry)
730 {
731         struct bch_inode_unpacked dir_u, inode_u;
732         int ret;
733
734         mutex_lock(&inode->ei_update_lock);
735         struct btree_trans *trans = bch2_trans_get(c);
736
737         ret = commit_do(trans, NULL, NULL, 0,
738                         bch2_link_trans(trans,
739                                         inode_inum(dir),   &dir_u,
740                                         inode_inum(inode), &inode_u,
741                                         &dentry->d_name));
742
743         if (likely(!ret)) {
744                 bch2_inode_update_after_write(trans, dir, &dir_u,
745                                               ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
746                 bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
747         }
748
749         bch2_trans_put(trans);
750         mutex_unlock(&inode->ei_update_lock);
751         return ret;
752 }
753
754 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
755                      struct dentry *dentry)
756 {
757         struct bch_fs *c = vdir->i_sb->s_fs_info;
758         struct bch_inode_info *dir = to_bch_ei(vdir);
759         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
760         int ret;
761
762         lockdep_assert_held(&inode->v.i_rwsem);
763
764         ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
765                 bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
766                 __bch2_link(c, inode, dir, dentry);
767         if (unlikely(ret))
768                 return bch2_err_class(ret);
769
770         ihold(&inode->v);
771         d_instantiate(dentry, &inode->v);
772         return 0;
773 }
774
775 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
776                   bool deleting_snapshot)
777 {
778         struct bch_fs *c = vdir->i_sb->s_fs_info;
779         struct bch_inode_info *dir = to_bch_ei(vdir);
780         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
781         struct bch_inode_unpacked dir_u, inode_u;
782         int ret;
783
784         bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
785
786         struct btree_trans *trans = bch2_trans_get(c);
787
788         ret = commit_do(trans, NULL, NULL,
789                         BCH_TRANS_COMMIT_no_enospc,
790                 bch2_unlink_trans(trans,
791                                   inode_inum(dir), &dir_u,
792                                   &inode_u, &dentry->d_name,
793                                   deleting_snapshot));
794         if (unlikely(ret))
795                 goto err;
796
797         bch2_inode_update_after_write(trans, dir, &dir_u,
798                                       ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
799         bch2_inode_update_after_write(trans, inode, &inode_u,
800                                       ATTR_MTIME);
801
802         if (inode_u.bi_subvol) {
803                 /*
804                  * Subvolume deletion is asynchronous, but we still want to tell
805                  * the VFS that it's been deleted here:
806                  */
807                 set_nlink(&inode->v, 0);
808         }
809 err:
810         bch2_trans_put(trans);
811         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
812
813         return ret;
814 }
815
816 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
817 {
818         struct bch_inode_info *dir = to_bch_ei(vdir);
819         struct bch_fs *c = dir->v.i_sb->s_fs_info;
820
821         int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
822                 __bch2_unlink(vdir, dentry, false);
823         return bch2_err_class(ret);
824 }
825
826 static int bch2_symlink(struct mnt_idmap *idmap,
827                         struct inode *vdir, struct dentry *dentry,
828                         const char *symname)
829 {
830         struct bch_fs *c = vdir->i_sb->s_fs_info;
831         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
832         int ret;
833
834         inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
835                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
836         if (IS_ERR(inode))
837                 return bch2_err_class(PTR_ERR(inode));
838
839         inode_lock(&inode->v);
840         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
841         inode_unlock(&inode->v);
842
843         if (unlikely(ret))
844                 goto err;
845
846         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
847         if (unlikely(ret))
848                 goto err;
849
850         ret = __bch2_link(c, inode, dir, dentry);
851         if (unlikely(ret))
852                 goto err;
853
854         d_instantiate(dentry, &inode->v);
855         return 0;
856 err:
857         iput(&inode->v);
858         return bch2_err_class(ret);
859 }
860
861 static int bch2_mkdir(struct mnt_idmap *idmap,
862                       struct inode *vdir, struct dentry *dentry, umode_t mode)
863 {
864         return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
865 }
866
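/*
 * Handles plain renames as well as RENAME_EXCHANGE and RENAME_WHITEOUT; a
 * whiteout is created as an S_IFCHR|WHITEOUT_MODE inode at the source name.
 */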
867 static int bch2_rename2(struct mnt_idmap *idmap,
868                         struct inode *src_vdir, struct dentry *src_dentry,
869                         struct inode *dst_vdir, struct dentry *dst_dentry,
870                         unsigned flags)
871 {
872         struct bch_fs *c = src_vdir->i_sb->s_fs_info;
873         struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
874         struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
875         struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
876         struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
877         struct bch_inode_unpacked dst_dir_u, src_dir_u;
878         struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
879         struct btree_trans *trans;
880         enum bch_rename_mode mode = flags & RENAME_EXCHANGE
881                 ? BCH_RENAME_EXCHANGE
882                 : dst_dentry->d_inode
883                 ? BCH_RENAME_OVERWRITE : BCH_RENAME;
884         bool whiteout = !!(flags & RENAME_WHITEOUT);
885         int ret;
886
887         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
888                 return -EINVAL;
889
890         if (mode == BCH_RENAME_OVERWRITE) {
891                 ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
892                                                    0, LLONG_MAX);
893                 if (ret)
894                         return ret;
895         }
896
897         bch2_lock_inodes(INODE_UPDATE_LOCK,
898                          src_dir,
899                          dst_dir,
900                          src_inode,
901                          dst_inode);
902
903         trans = bch2_trans_get(c);
904
905         ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
906                 bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
907         if (ret)
908                 goto err_tx_restart;
909
910         if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
911                 ret = bch2_fs_quota_transfer(c, src_inode,
912                                              dst_dir->ei_qid,
913                                              1 << QTYP_PRJ,
914                                              KEY_TYPE_QUOTA_PREALLOC);
915                 if (ret)
916                         goto err;
917         }
918
919         if (mode == BCH_RENAME_EXCHANGE &&
920             inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
921                 ret = bch2_fs_quota_transfer(c, dst_inode,
922                                              src_dir->ei_qid,
923                                              1 << QTYP_PRJ,
924                                              KEY_TYPE_QUOTA_PREALLOC);
925                 if (ret)
926                         goto err;
927         }
928 retry:
929         bch2_trans_begin(trans);
930
931         ret = bch2_rename_trans(trans,
932                                 inode_inum(src_dir), &src_dir_u,
933                                 inode_inum(dst_dir), &dst_dir_u,
934                                 &src_inode_u,
935                                 &dst_inode_u,
936                                 &src_dentry->d_name,
937                                 &dst_dentry->d_name,
938                                 mode);
939         if (unlikely(ret))
940                 goto err_tx_restart;
941
942         if (whiteout) {
943                 whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
944                 ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
945                 if (unlikely(ret))
946                         goto err_tx_restart;
947                 bch2_inode_init_early(c, whiteout_inode_u);
948
949                 ret = bch2_create_trans(trans,
950                                         inode_inum(src_dir), &src_dir_u,
951                                         whiteout_inode_u,
952                                         &src_dentry->d_name,
953                                         from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
954                                         from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
955                                         S_IFCHR|WHITEOUT_MODE, 0,
956                                         NULL, NULL, (subvol_inum) { 0 }, 0) ?:
957                       bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
958                                       KEY_TYPE_QUOTA_PREALLOC);
959                 if (unlikely(ret))
960                         goto err_tx_restart;
961         }
962
963         ret = bch2_trans_commit(trans, NULL, NULL, 0);
964         if (unlikely(ret)) {
965 err_tx_restart:
966                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
967                         goto retry;
968                 goto err;
969         }
970
971         BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
972         BUG_ON(dst_inode &&
973                dst_inode->v.i_ino != dst_inode_u.bi_inum);
974
975         bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
976                                       ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
977
978         if (src_dir != dst_dir)
979                 bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
980                                               ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
981
982         bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
983                                       ATTR_CTIME);
984
985         if (dst_inode)
986                 bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
987                                               ATTR_CTIME);
988 err:
989         bch2_trans_put(trans);
990
991         bch2_fs_quota_transfer(c, src_inode,
992                                bch_qid(&src_inode->ei_inode),
993                                1 << QTYP_PRJ,
994                                KEY_TYPE_QUOTA_NOCHECK);
995         if (dst_inode)
996                 bch2_fs_quota_transfer(c, dst_inode,
997                                        bch_qid(&dst_inode->ei_inode),
998                                        1 << QTYP_PRJ,
999                                        KEY_TYPE_QUOTA_NOCHECK);
1000
1001         bch2_unlock_inodes(INODE_UPDATE_LOCK,
1002                            src_dir,
1003                            dst_dir,
1004                            src_inode,
1005                            dst_inode);
1006
1007         return bch2_err_class(ret);
1008 }
1009
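/*
 * Apply validated iattr changes to the unpacked btree inode, mapping
 * uids/gids through the idmap and clearing SGID when the caller lacks the
 * required group membership.
 */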
1010 static void bch2_setattr_copy(struct mnt_idmap *idmap,
1011                               struct bch_inode_info *inode,
1012                               struct bch_inode_unpacked *bi,
1013                               struct iattr *attr)
1014 {
1015         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1016         unsigned int ia_valid = attr->ia_valid;
1017         kuid_t kuid;
1018         kgid_t kgid;
1019
1020         if (ia_valid & ATTR_UID) {
1021                 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
1022                 bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
1023         }
1024         if (ia_valid & ATTR_GID) {
1025                 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
1026                 bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
1027         }
1028
1029         if (ia_valid & ATTR_SIZE)
1030                 bi->bi_size = attr->ia_size;
1031
1032         if (ia_valid & ATTR_ATIME)
1033                 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
1034         if (ia_valid & ATTR_MTIME)
1035                 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
1036         if (ia_valid & ATTR_CTIME)
1037                 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
1038
1039         if (ia_valid & ATTR_MODE) {
1040                 umode_t mode = attr->ia_mode;
1041                 kgid_t gid = ia_valid & ATTR_GID
1042                         ? kgid
1043                         : inode->v.i_gid;
1044
1045                 if (!in_group_or_capable(idmap, &inode->v,
1046                         make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
1047                         mode &= ~S_ISGID;
1048                 bi->bi_mode = mode;
1049         }
1050 }
1051
1052 int bch2_setattr_nonsize(struct mnt_idmap *idmap,
1053                          struct bch_inode_info *inode,
1054                          struct iattr *attr)
1055 {
1056         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1057         struct bch_qid qid;
1058         struct btree_trans *trans;
1059         struct btree_iter inode_iter = { NULL };
1060         struct bch_inode_unpacked inode_u;
1061         struct posix_acl *acl = NULL;
1062         kuid_t kuid;
1063         kgid_t kgid;
1064         int ret;
1065
1066         mutex_lock(&inode->ei_update_lock);
1067
1068         qid = inode->ei_qid;
1069
1070         if (attr->ia_valid & ATTR_UID) {
1071                 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
1072                 qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
1073         }
1074
1075         if (attr->ia_valid & ATTR_GID) {
1076                 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
1077                 qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
1078         }
1079
1080         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
1081                                      KEY_TYPE_QUOTA_PREALLOC);
1082         if (ret)
1083                 goto err;
1084
1085         trans = bch2_trans_get(c);
1086 retry:
1087         bch2_trans_begin(trans);
1088         kfree(acl);
1089         acl = NULL;
1090
1091         ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
1092                               BTREE_ITER_intent);
1093         if (ret)
1094                 goto btree_err;
1095
1096         bch2_setattr_copy(idmap, inode, &inode_u, attr);
1097
1098         if (attr->ia_valid & ATTR_MODE) {
1099                 ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
1100                                      inode_u.bi_mode, &acl);
1101                 if (ret)
1102                         goto btree_err;
1103         }
1104
1105         ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
1106                 bch2_trans_commit(trans, NULL, NULL,
1107                                   BCH_TRANS_COMMIT_no_enospc);
1108 btree_err:
1109         bch2_trans_iter_exit(trans, &inode_iter);
1110
1111         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1112                 goto retry;
1113         if (unlikely(ret))
1114                 goto err_trans;
1115
1116         bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
1117
1118         if (acl)
1119                 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
1120 err_trans:
1121         bch2_trans_put(trans);
1122 err:
1123         mutex_unlock(&inode->ei_update_lock);
1124
1125         return bch2_err_class(ret);
1126 }
1127
1128 static int bch2_getattr(struct mnt_idmap *idmap,
1129                         const struct path *path, struct kstat *stat,
1130                         u32 request_mask, unsigned query_flags)
1131 {
1132         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
1133         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1134         vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
1135         vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
1136
1137         stat->dev       = inode->v.i_sb->s_dev;
1138         stat->ino       = inode->v.i_ino;
1139         stat->mode      = inode->v.i_mode;
1140         stat->nlink     = inode->v.i_nlink;
1141         stat->uid       = vfsuid_into_kuid(vfsuid);
1142         stat->gid       = vfsgid_into_kgid(vfsgid);
1143         stat->rdev      = inode->v.i_rdev;
1144         stat->size      = i_size_read(&inode->v);
1145         stat->atime     = inode_get_atime(&inode->v);
1146         stat->mtime     = inode_get_mtime(&inode->v);
1147         stat->ctime     = inode_get_ctime(&inode->v);
1148         stat->blksize   = block_bytes(c);
1149         stat->blocks    = inode->v.i_blocks;
1150
1151         stat->subvol    = inode->ei_inum.subvol;
1152         stat->result_mask |= STATX_SUBVOL;
1153
1154         if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
1155                 stat->result_mask |= STATX_DIOALIGN;
1156                 /*
1157                  * this is incorrect; we should be tracking this in superblock,
1158                  * and checking the alignment of open devices
1159                  */
1160                 stat->dio_mem_align = SECTOR_SIZE;
1161                 stat->dio_offset_align = block_bytes(c);
1162         }
1163
1164         if (request_mask & STATX_BTIME) {
1165                 stat->result_mask |= STATX_BTIME;
1166                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
1167         }
1168
1169         if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
1170                 stat->attributes |= STATX_ATTR_IMMUTABLE;
1171         stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
1172
1173         if (inode->ei_inode.bi_flags & BCH_INODE_append)
1174                 stat->attributes |= STATX_ATTR_APPEND;
1175         stat->attributes_mask    |= STATX_ATTR_APPEND;
1176
1177         if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
1178                 stat->attributes |= STATX_ATTR_NODUMP;
1179         stat->attributes_mask    |= STATX_ATTR_NODUMP;
1180
1181         return 0;
1182 }
1183
1184 static int bch2_setattr(struct mnt_idmap *idmap,
1185                         struct dentry *dentry, struct iattr *iattr)
1186 {
1187         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
1188         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1189         int ret;
1190
1191         lockdep_assert_held(&inode->v.i_rwsem);
1192
1193         ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
1194                 setattr_prepare(idmap, dentry, iattr);
1195         if (ret)
1196                 return ret;
1197
1198         return iattr->ia_valid & ATTR_SIZE
1199                 ? bchfs_truncate(idmap, inode, iattr)
1200                 : bch2_setattr_nonsize(idmap, inode, iattr);
1201 }
1202
1203 static int bch2_tmpfile(struct mnt_idmap *idmap,
1204                         struct inode *vdir, struct file *file, umode_t mode)
1205 {
1206         struct bch_inode_info *inode =
1207                 __bch2_create(idmap, to_bch_ei(vdir),
1208                               file->f_path.dentry, mode, 0,
1209                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
1210
1211         if (IS_ERR(inode))
1212                 return bch2_err_class(PTR_ERR(inode));
1213
1214         d_mark_tmpfile(file, &inode->v);
1215         d_instantiate(file->f_path.dentry, &inode->v);
1216         return finish_open_simple(file, 0);
1217 }
1218
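/*
 * Report a single extent to fiemap: direct extents get per-pointer
 * UNWRITTEN/ENCODED/NOT_ALIGNED flags, reflinked extents are marked SHARED,
 * and inline data and reservations are reported specially.
 */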
1219 static int bch2_fill_extent(struct bch_fs *c,
1220                             struct fiemap_extent_info *info,
1221                             struct bkey_s_c k, unsigned flags)
1222 {
1223         if (bkey_extent_is_direct_data(k.k)) {
1224                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1225                 const union bch_extent_entry *entry;
1226                 struct extent_ptr_decoded p;
1227                 int ret;
1228
1229                 if (k.k->type == KEY_TYPE_reflink_v)
1230                         flags |= FIEMAP_EXTENT_SHARED;
1231
1232                 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1233                         int flags2 = 0;
1234                         u64 offset = p.ptr.offset;
1235
1236                         if (p.ptr.unwritten)
1237                                 flags2 |= FIEMAP_EXTENT_UNWRITTEN;
1238
1239                         if (p.crc.compression_type)
1240                                 flags2 |= FIEMAP_EXTENT_ENCODED;
1241                         else
1242                                 offset += p.crc.offset;
1243
1244                         if ((offset & (block_sectors(c) - 1)) ||
1245                             (k.k->size & (block_sectors(c) - 1)))
1246                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
1247
1248                         ret = fiemap_fill_next_extent(info,
1249                                                 bkey_start_offset(k.k) << 9,
1250                                                 offset << 9,
1251                                                 k.k->size << 9, flags|flags2);
1252                         if (ret)
1253                                 return ret;
1254                 }
1255
1256                 return 0;
1257         } else if (bkey_extent_is_inline_data(k.k)) {
1258                 return fiemap_fill_next_extent(info,
1259                                                bkey_start_offset(k.k) << 9,
1260                                                0, k.k->size << 9,
1261                                                flags|
1262                                                FIEMAP_EXTENT_DATA_INLINE);
1263         } else if (k.k->type == KEY_TYPE_reservation) {
1264                 return fiemap_fill_next_extent(info,
1265                                                bkey_start_offset(k.k) << 9,
1266                                                0, k.k->size << 9,
1267                                                flags|
1268                                                FIEMAP_EXTENT_DELALLOC|
1269                                                FIEMAP_EXTENT_UNWRITTEN);
1270         } else {
1271                 BUG();
1272         }
1273 }
1274
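/*
 * Walk the extents btree for this file, following indirect (reflink)
 * extents, and report each extent to fiemap; extents are emitted one behind
 * so the final one can carry FIEMAP_EXTENT_LAST.
 */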
1275 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
1276                        u64 start, u64 len)
1277 {
1278         struct bch_fs *c = vinode->i_sb->s_fs_info;
1279         struct bch_inode_info *ei = to_bch_ei(vinode);
1280         struct btree_trans *trans;
1281         struct btree_iter iter;
1282         struct bkey_s_c k;
1283         struct bkey_buf cur, prev;
1284         bool have_extent = false;
1285         int ret = 0;
1286
1287         ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
1288         if (ret)
1289                 return ret;
1290
1291         struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
1292         if (start + len < start)
1293                 return -EINVAL;
1294
1295         start >>= 9;
1296
1297         bch2_bkey_buf_init(&cur);
1298         bch2_bkey_buf_init(&prev);
1299         trans = bch2_trans_get(c);
1300
1301         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1302                              POS(ei->v.i_ino, start), 0);
1303
1304         while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1305                 enum btree_id data_btree = BTREE_ID_extents;
1306
1307                 bch2_trans_begin(trans);
1308
1309                 u32 snapshot;
1310                 ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
1311                 if (ret)
1312                         continue;
1313
1314                 bch2_btree_iter_set_snapshot(&iter, snapshot);
1315
1316                 k = bch2_btree_iter_peek_max(&iter, end);
1317                 ret = bkey_err(k);
1318                 if (ret)
1319                         continue;
1320
1321                 if (!k.k)
1322                         break;
1323
1324                 if (!bkey_extent_is_data(k.k) &&
1325                     k.k->type != KEY_TYPE_reservation) {
1326                         bch2_btree_iter_advance(&iter);
1327                         continue;
1328                 }
1329
1330                 s64 offset_into_extent  = iter.pos.offset - bkey_start_offset(k.k);
1331                 unsigned sectors        = k.k->size - offset_into_extent;
1332
1333                 bch2_bkey_buf_reassemble(&cur, c, k);
1334
1335                 ret = bch2_read_indirect_extent(trans, &data_btree,
1336                                         &offset_into_extent, &cur);
1337                 if (ret)
1338                         continue;
1339
1340                 k = bkey_i_to_s_c(cur.k);
1341                 bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
1342
1343                 sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
1344
1345                 bch2_cut_front(POS(k.k->p.inode,
1346                                    bkey_start_offset(k.k) +
1347                                    offset_into_extent),
1348                                cur.k);
1349                 bch2_key_resize(&cur.k->k, sectors);
1350                 cur.k->k.p = iter.pos;
1351                 cur.k->k.p.offset += cur.k->k.size;
1352
1353                 if (have_extent) {
1354                         bch2_trans_unlock(trans);
1355                         ret = bch2_fill_extent(c, info,
1356                                         bkey_i_to_s_c(prev.k), 0);
1357                         if (ret)
1358                                 break;
1359                 }
1360
1361                 bkey_copy(prev.k, cur.k);
1362                 have_extent = true;
1363
1364                 bch2_btree_iter_set_pos(&iter,
1365                         POS(iter.pos.inode, iter.pos.offset + sectors));
1366         }
1367         bch2_trans_iter_exit(trans, &iter);
1368
1369         if (!ret && have_extent) {
1370                 bch2_trans_unlock(trans);
1371                 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
1372                                        FIEMAP_EXTENT_LAST);
1373         }
1374
1375         bch2_trans_put(trans);
1376         bch2_bkey_buf_exit(&cur, c);
1377         bch2_bkey_buf_exit(&prev, c);
1378         return ret < 0 ? ret : 0;
1379 }
1380
1381 static const struct vm_operations_struct bch_vm_ops = {
1382         .fault          = bch2_page_fault,
1383         .map_pages      = filemap_map_pages,
1384         .page_mkwrite   = bch2_page_mkwrite,
1385 };
1386
1387 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1388 {
1389         file_accessed(file);
1390
1391         vma->vm_ops = &bch_vm_ops;
1392         return 0;
1393 }
1394
1395 /* Directories: */
1396
1397 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1398 {
1399         return generic_file_llseek_size(file, offset, whence,
1400                                         S64_MAX, S64_MAX);
1401 }
1402
1403 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1404 {
1405         struct bch_inode_info *inode = file_bch_inode(file);
1406         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1407
1408         if (!dir_emit_dots(file, ctx))
1409                 return 0;
1410
1411         int ret = bch2_readdir(c, inode_inum(inode), ctx);
1412
1413         bch_err_fn(c, ret);
1414         return bch2_err_class(ret);
1415 }
1416
1417 static int bch2_open(struct inode *vinode, struct file *file)
1418 {
1419         if (file->f_flags & (O_WRONLY|O_RDWR)) {
1420                 struct bch_inode_info *inode = to_bch_ei(vinode);
1421                 struct bch_fs *c = inode->v.i_sb->s_fs_info;
1422
1423                 int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
1424                 if (ret)
1425                         return ret;
1426         }
1427
1428         file->f_mode |= FMODE_CAN_ODIRECT;
1429
1430         return generic_file_open(vinode, file);
1431 }
1432
1433 static const struct file_operations bch_file_operations = {
1434         .open           = bch2_open,
1435         .llseek         = bch2_llseek,
1436         .read_iter      = bch2_read_iter,
1437         .write_iter     = bch2_write_iter,
1438         .mmap           = bch2_mmap,
1439         .get_unmapped_area = thp_get_unmapped_area,
1440         .fsync          = bch2_fsync,
1441         .splice_read    = filemap_splice_read,
1442         .splice_write   = iter_file_splice_write,
1443         .fallocate      = bch2_fallocate_dispatch,
1444         .unlocked_ioctl = bch2_fs_file_ioctl,
1445 #ifdef CONFIG_COMPAT
1446         .compat_ioctl   = bch2_compat_fs_ioctl,
1447 #endif
1448         .remap_file_range = bch2_remap_file_range,
1449 };
1450
1451 static const struct inode_operations bch_file_inode_operations = {
1452         .getattr        = bch2_getattr,
1453         .setattr        = bch2_setattr,
1454         .fiemap         = bch2_fiemap,
1455         .listxattr      = bch2_xattr_list,
1456 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1457         .get_inode_acl  = bch2_get_acl,
1458         .set_acl        = bch2_set_acl,
1459 #endif
1460 };
1461
1462 static const struct inode_operations bch_dir_inode_operations = {
1463         .lookup         = bch2_lookup,
1464         .create         = bch2_create,
1465         .link           = bch2_link,
1466         .unlink         = bch2_unlink,
1467         .symlink        = bch2_symlink,
1468         .mkdir          = bch2_mkdir,
1469         .rmdir          = bch2_unlink,
1470         .mknod          = bch2_mknod,
1471         .rename         = bch2_rename2,
1472         .getattr        = bch2_getattr,
1473         .setattr        = bch2_setattr,
1474         .tmpfile        = bch2_tmpfile,
1475         .listxattr      = bch2_xattr_list,
1476 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1477         .get_inode_acl  = bch2_get_acl,
1478         .set_acl        = bch2_set_acl,
1479 #endif
1480 };
1481
1482 static const struct file_operations bch_dir_file_operations = {
1483         .llseek         = bch2_dir_llseek,
1484         .read           = generic_read_dir,
1485         .iterate_shared = bch2_vfs_readdir,
1486         .fsync          = bch2_fsync,
1487         .unlocked_ioctl = bch2_fs_file_ioctl,
1488 #ifdef CONFIG_COMPAT
1489         .compat_ioctl   = bch2_compat_fs_ioctl,
1490 #endif
1491 };
1492
1493 static const struct inode_operations bch_symlink_inode_operations = {
1494         .get_link       = page_get_link,
1495         .getattr        = bch2_getattr,
1496         .setattr        = bch2_setattr,
1497         .listxattr      = bch2_xattr_list,
1498 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1499         .get_inode_acl  = bch2_get_acl,
1500         .set_acl        = bch2_set_acl,
1501 #endif
1502 };
1503
1504 static const struct inode_operations bch_special_inode_operations = {
1505         .getattr        = bch2_getattr,
1506         .setattr        = bch2_setattr,
1507         .listxattr      = bch2_xattr_list,
1508 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1509         .get_inode_acl  = bch2_get_acl,
1510         .set_acl        = bch2_set_acl,
1511 #endif
1512 };
1513
1514 static const struct address_space_operations bch_address_space_operations = {
1515         .read_folio     = bch2_read_folio,
1516         .writepages     = bch2_writepages,
1517         .readahead      = bch2_readahead,
1518         .dirty_folio    = filemap_dirty_folio,
1519         .write_begin    = bch2_write_begin,
1520         .write_end      = bch2_write_end,
1521         .invalidate_folio = bch2_invalidate_folio,
1522         .release_folio  = bch2_release_folio,
1523 #ifdef CONFIG_MIGRATION
1524         .migrate_folio  = filemap_migrate_folio,
1525 #endif
1526         .error_remove_folio = generic_error_remove_folio,
1527 };
1528
1529 struct bcachefs_fid {
1530         u64             inum;
1531         u32             subvol;
1532         u32             gen;
1533 } __packed;
1534
1535 struct bcachefs_fid_with_parent {
1536         struct bcachefs_fid     fid;
1537         struct bcachefs_fid     dir;
1538 } __packed;
1539
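/*
 * Handle lengths are expressed in 32-bit words, as elsewhere in the exportfs
 * API: struct bcachefs_fid packs to 16 bytes (4 words), and
 * struct bcachefs_fid_with_parent to 32 bytes (8 words).
 */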
1540 static int bcachefs_fid_valid(int fh_len, int fh_type)
1541 {
1542         switch (fh_type) {
1543         case FILEID_BCACHEFS_WITHOUT_PARENT:
1544                 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1545         case FILEID_BCACHEFS_WITH_PARENT:
1546                 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1547         default:
1548                 return false;
1549         }
1550 }
1551
1552 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1553 {
1554         return (struct bcachefs_fid) {
1555                 .inum   = inode->ei_inum.inum,
1556                 .subvol = inode->ei_inum.subvol,
1557                 .gen    = inode->ei_inode.bi_generation,
1558         };
1559 }
1560
1561 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1562                           struct inode *vdir)
1563 {
1564         struct bch_inode_info *inode    = to_bch_ei(vinode);
1565         struct bch_inode_info *dir      = to_bch_ei(vdir);
1566         int min_len;
1567
1568         if (!S_ISDIR(inode->v.i_mode) && dir) {
1569                 struct bcachefs_fid_with_parent *fid = (void *) fh;
1570
1571                 min_len = sizeof(*fid) / sizeof(u32);
1572                 if (*len < min_len) {
1573                         *len = min_len;
1574                         return FILEID_INVALID;
1575                 }
1576
1577                 fid->fid = bch2_inode_to_fid(inode);
1578                 fid->dir = bch2_inode_to_fid(dir);
1579
1580                 *len = min_len;
1581                 return FILEID_BCACHEFS_WITH_PARENT;
1582         } else {
1583                 struct bcachefs_fid *fid = (void *) fh;
1584
1585                 min_len = sizeof(*fid) / sizeof(u32);
1586                 if (*len < min_len) {
1587                         *len = min_len;
1588                         return FILEID_INVALID;
1589                 }
1590                 *fid = bch2_inode_to_fid(inode);
1591
1592                 *len = min_len;
1593                 return FILEID_BCACHEFS_WITHOUT_PARENT;
1594         }
1595 }
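
/*
 * The handle encoded here is what userspace gets back from name_to_handle_at(2).
 * A rough userspace sketch (not part of this file; needs _GNU_SOURCE and
 * <fcntl.h>, error handling omitted) of fetching a handle and re-opening it
 * with open_by_handle_at(2), which requires CAP_DAC_READ_SEARCH:
 *
 *        struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
 *        int mount_id;
 *
 *        fh->handle_bytes = MAX_HANDLE_SZ;
 *        name_to_handle_at(AT_FDCWD, "/mnt/bcachefs/foo", fh, &mount_id, 0);
 *
 *        // fh->handle_type is now FILEID_BCACHEFS_WITHOUT_PARENT or _WITH_PARENT,
 *        // and fh->f_handle holds the struct bcachefs_fid encoded above
 *        int fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
 *
 * mount_fd is assumed to be an open fd somewhere on the mounted filesystem
 * (e.g. the mount point itself); "/mnt/bcachefs/foo" is just a placeholder path.
 */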
1596
1597 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1598                                         struct bcachefs_fid fid)
1599 {
1600         struct bch_fs *c = sb->s_fs_info;
1601         struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1602                                     .subvol = fid.subvol,
1603                                     .inum = fid.inum,
1604         });
1605         if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1606                 iput(vinode);
1607                 vinode = ERR_PTR(-ESTALE);
1608         }
1609         return vinode;
1610 }
1611
1612 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1613                 int fh_len, int fh_type)
1614 {
1615         struct bcachefs_fid *fid = (void *) _fid;
1616
1617         if (!bcachefs_fid_valid(fh_len, fh_type))
1618                 return NULL;
1619
1620         return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1621 }
1622
1623 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1624                 int fh_len, int fh_type)
1625 {
1626         struct bcachefs_fid_with_parent *fid = (void *) _fid;
1627
1628         if (!bcachefs_fid_valid(fh_len, fh_type) ||
1629             fh_type != FILEID_BCACHEFS_WITH_PARENT)
1630                 return NULL;
1631
1632         return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1633 }
1634
1635 static struct dentry *bch2_get_parent(struct dentry *child)
1636 {
1637         struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1638         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1639         subvol_inum parent_inum = {
1640                 .subvol = inode->ei_inode.bi_parent_subvol ?:
1641                         inode->ei_inum.subvol,
1642                 .inum = inode->ei_inode.bi_dir,
1643         };
1644
1645         return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1646 }
1647
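/*
 * exportfs ->get_name(): find the name of @child within @parent.  The fast path
 * follows the inode's backref (bi_dir/bi_dir_offset) straight to its dirent;
 * if the backref points elsewhere (multiple hardlinks), fall back to scanning
 * the directory's dirents for one that resolves to this inode.
 */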
1648 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1649 {
1650         struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
1651         struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
1652         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1653         struct btree_trans *trans;
1654         struct btree_iter iter1;
1655         struct btree_iter iter2;
1656         struct bkey_s_c k;
1657         struct bkey_s_c_dirent d;
1658         struct bch_inode_unpacked inode_u;
1659         subvol_inum target;
1660         u32 snapshot;
1661         struct qstr dirent_name;
1662         unsigned name_len = 0;
1663         int ret;
1664
1665         if (!S_ISDIR(dir->v.i_mode))
1666                 return -EINVAL;
1667
1668         trans = bch2_trans_get(c);
1669
1670         bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1671                              POS(dir->ei_inode.bi_inum, 0), 0);
1672         bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1673                              POS(dir->ei_inode.bi_inum, 0), 0);
1674 retry:
1675         bch2_trans_begin(trans);
1676
1677         ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
1678         if (ret)
1679                 goto err;
1680
1681         bch2_btree_iter_set_snapshot(&iter1, snapshot);
1682         bch2_btree_iter_set_snapshot(&iter2, snapshot);
1683
1684         ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1685         if (ret)
1686                 goto err;
1687
1688         if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1689                 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1690
1691                 k = bch2_btree_iter_peek_slot(&iter1);
1692                 ret = bkey_err(k);
1693                 if (ret)
1694                         goto err;
1695
1696                 if (k.k->type != KEY_TYPE_dirent) {
1697                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1698                         goto err;
1699                 }
1700
1701                 d = bkey_s_c_to_dirent(k);
1702                 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1703                 if (ret > 0)
1704                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1705                 if (ret)
1706                         goto err;
1707
1708                 if (subvol_inum_eq(target, inode->ei_inum))
1709                         goto found;
1710         } else {
1711                 /*
1712                  * File with multiple hardlinks whose backref points to a
1713                  * different directory - fall back to a linear search:
1714                  */
1715                 for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1716                         if (k.k->p.inode > dir->ei_inode.bi_inum)
1717                                 break;
1718
1719                         if (k.k->type != KEY_TYPE_dirent)
1720                                 continue;
1721
1722                         d = bkey_s_c_to_dirent(k);
1723                         ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1724                         if (ret < 0)
1725                                 break;
1726                         if (ret)
1727                                 continue;
1728
1729                         if (subvol_inum_eq(target, inode->ei_inum))
1730                                 goto found;
1731                 }
1732         }
1733
1734         ret = -ENOENT;
1735         goto err;
1736 found:
1737         dirent_name = bch2_dirent_get_name(d);
1738
1739         name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1740         memcpy(name, dirent_name.name, name_len);
1741         name[name_len] = '\0';
1742 err:
1743         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1744                 goto retry;
1745
1746         bch2_trans_iter_exit(trans, &iter1);
1747         bch2_trans_iter_exit(trans, &iter2);
1748         bch2_trans_put(trans);
1749
1750         return ret;
1751 }
1752
1753 static const struct export_operations bch_export_ops = {
1754         .encode_fh      = bch2_encode_fh,
1755         .fh_to_dentry   = bch2_fh_to_dentry,
1756         .fh_to_parent   = bch2_fh_to_parent,
1757         .get_parent     = bch2_get_parent,
1758         .get_name       = bch2_get_name,
1759 };
1760
1761 static void bch2_vfs_inode_init(struct btree_trans *trans,
1762                                 subvol_inum inum,
1763                                 struct bch_inode_info *inode,
1764                                 struct bch_inode_unpacked *bi,
1765                                 struct bch_subvolume *subvol)
1766 {
1767         inode->v.i_ino          = inum.inum;
1768         inode->ei_inum          = inum;
1769         inode->ei_inode.bi_inum = inum.inum;
1770         bch2_inode_update_after_write(trans, inode, bi, ~0);
1771
1772         inode->v.i_blocks       = bi->bi_sectors;
1773         inode->v.i_rdev         = bi->bi_dev;
1774         inode->v.i_generation   = bi->bi_generation;
1775         inode->v.i_size         = bi->bi_size;
1776
1777         inode->ei_flags         = 0;
1778         inode->ei_quota_reserved = 0;
1779         inode->ei_qid           = bch_qid(bi);
1780
1781         if (BCH_SUBVOLUME_SNAP(subvol))
1782                 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1783
1784         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1785
1786         switch (inode->v.i_mode & S_IFMT) {
1787         case S_IFREG:
1788                 inode->v.i_op   = &bch_file_inode_operations;
1789                 inode->v.i_fop  = &bch_file_operations;
1790                 break;
1791         case S_IFDIR:
1792                 inode->v.i_op   = &bch_dir_inode_operations;
1793                 inode->v.i_fop  = &bch_dir_file_operations;
1794                 break;
1795         case S_IFLNK:
1796                 inode_nohighmem(&inode->v);
1797                 inode->v.i_op   = &bch_symlink_inode_operations;
1798                 break;
1799         default:
1800                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1801                 inode->v.i_op   = &bch_special_inode_operations;
1802                 break;
1803         }
1804
1805         mapping_set_large_folios(inode->v.i_mapping);
1806 }
1807
1808 static void bch2_free_inode(struct inode *vinode)
1809 {
1810         kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1811 }
1812
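/*
 * bch2_write_inode() callback: copy the in-memory VFS timestamps into the
 * unpacked btree inode before it is repacked and written back.
 */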
1813 static int inode_update_times_fn(struct btree_trans *trans,
1814                                  struct bch_inode_info *inode,
1815                                  struct bch_inode_unpacked *bi,
1816                                  void *p)
1817 {
1818         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1819
1820         bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1821         bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1822         bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1823
1824         return 0;
1825 }
1826
1827 static int bch2_vfs_write_inode(struct inode *vinode,
1828                                 struct writeback_control *wbc)
1829 {
1830         struct bch_fs *c = vinode->i_sb->s_fs_info;
1831         struct bch_inode_info *inode = to_bch_ei(vinode);
1832         int ret;
1833
1834         mutex_lock(&inode->ei_update_lock);
1835         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1836                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1837         mutex_unlock(&inode->ei_update_lock);
1838
1839         return bch2_err_class(ret);
1840 }
1841
1842 static void bch2_evict_inode(struct inode *vinode)
1843 {
1844         struct bch_fs *c = vinode->i_sb->s_fs_info;
1845         struct bch_inode_info *inode = to_bch_ei(vinode);
1846         bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
1847
1848         /*
1849          * evict() has waited for outstanding writeback and we'll do no more IO
1850          * through this inode, so it's safe to remove it from the VFS inode hashtable.
1851          *
1852          * Do that now so that other threads aren't blocked from pulling it back
1853          * in; there's no reason for them to be:
1854          */
1855         if (!delete)
1856                 bch2_inode_hash_remove(c, inode);
1857
1858         truncate_inode_pages_final(&inode->v.i_data);
1859
1860         clear_inode(&inode->v);
1861
1862         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1863
1864         if (delete) {
1865                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1866                                 KEY_TYPE_QUOTA_WARN);
1867                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1868                                 KEY_TYPE_QUOTA_WARN);
1869                 bch2_inode_rm(c, inode_inum(inode));
1870
1871                 /*
1872                  * The inode had to stay in the vfs hash table during the delete so
1873                  * that fsck could check if unlinked inodes are still open; remove it now:
1874                  */
1875                 bch2_inode_hash_remove(c, inode);
1876         }
1877
1878         mutex_lock(&c->vfs_inodes_lock);
1879         list_del_init(&inode->ei_vfs_inode_list);
1880         mutex_unlock(&c->vfs_inodes_lock);
1881 }
1882
1883 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1884 {
1885         struct bch_inode_info *inode;
1886         DARRAY(struct bch_inode_info *) grabbed;
1887         bool clean_pass = false, this_pass_clean;
1888
1889         /*
1890          * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1891          * be pruned with d_mark_dontcache().
1892          *
1893          * Once we've had a clean pass where we didn't find any inodes without
1894          * I_DONTCACHE, we wait for them to be freed:
1895          */
1896
1897         darray_init(&grabbed);
1898         darray_make_room(&grabbed, 1024);
1899 again:
1900         cond_resched();
1901         this_pass_clean = true;
1902
1903         mutex_lock(&c->vfs_inodes_lock);
1904         list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1905                 if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
1906                         continue;
1907
1908                 if (!(inode->v.i_state & I_DONTCACHE) &&
1909                     !(inode->v.i_state & I_FREEING) &&
1910                     igrab(&inode->v)) {
1911                         this_pass_clean = false;
1912
1913                         if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1914                                 iput(&inode->v);
1915                                 break;
1916                         }
1917                 } else if (clean_pass && this_pass_clean) {
1918                         struct wait_bit_queue_entry wqe;
1919                         struct wait_queue_head *wq_head;
1920
1921                         wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
1922                         prepare_to_wait_event(wq_head, &wqe.wq_entry,
1923                                               TASK_UNINTERRUPTIBLE);
1924                         mutex_unlock(&c->vfs_inodes_lock);
1925
1926                         schedule();
1927                         finish_wait(wq_head, &wqe.wq_entry);
1928                         goto again;
1929                 }
1930         }
1931         mutex_unlock(&c->vfs_inodes_lock);
1932
1933         darray_for_each(grabbed, i) {
1934                 inode = *i;
1935                 d_mark_dontcache(&inode->v);
1936                 d_prune_aliases(&inode->v);
1937                 iput(&inode->v);
1938         }
1939         grabbed.nr = 0;
1940
1941         if (!clean_pass || !this_pass_clean) {
1942                 clean_pass = this_pass_clean;
1943                 goto again;
1944         }
1945
1946         darray_exit(&grabbed);
1947 }
1948
1949 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1950 {
1951         struct super_block *sb = dentry->d_sb;
1952         struct bch_fs *c = sb->s_fs_info;
1953         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1954         unsigned shift = sb->s_blocksize_bits - 9;
1955         /*
1956          * This assumes each inode takes up 64 bytes, a decent average; capacity
1957          * and used are in 512-byte sectors, so << 3 gives free space / 64:
1958          */
1959         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1960
1961         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1962         buf->f_bsize    = sb->s_blocksize;
1963         buf->f_blocks   = usage.capacity >> shift;
1964         buf->f_bfree    = usage.free >> shift;
1965         buf->f_bavail   = avail_factor(usage.free) >> shift;
1966
1967         buf->f_files    = usage.nr_inodes + avail_inodes;
1968         buf->f_ffree    = avail_inodes;
1969
1970         buf->f_fsid     = uuid_to_fsid(c->sb.user_uuid.b);
1971         buf->f_namelen  = BCH_NAME_MAX;
1972
1973         return 0;
1974 }
1975
1976 static int bch2_sync_fs(struct super_block *sb, int wait)
1977 {
1978         struct bch_fs *c = sb->s_fs_info;
1979         int ret;
1980
1981         trace_bch2_sync_fs(sb, wait);
1982
1983         if (c->opts.journal_flush_disabled)
1984                 return 0;
1985
1986         if (!wait) {
1987                 bch2_journal_flush_async(&c->journal, NULL);
1988                 return 0;
1989         }
1990
1991         ret = bch2_journal_flush(&c->journal);
1992         return bch2_err_class(ret);
1993 }
1994
1995 static struct bch_fs *bch2_path_to_fs(const char *path)
1996 {
1997         struct bch_fs *c;
1998         dev_t dev;
1999         int ret;
2000
2001         ret = lookup_bdev(path, &dev);
2002         if (ret)
2003                 return ERR_PTR(ret);
2004
2005         c = bch2_dev_to_fs(dev);
2006         if (c)
2007                 closure_put(&c->cl);
2008         return c ?: ERR_PTR(-ENOENT);
2009 }
2010
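/*
 * Handles option changes on remount - e.g. "mount -o remount,ro <dev> <mnt>"
 * toggles read_only below; beyond that, only the errors= behaviour can be
 * changed here.
 */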
2011 static int bch2_remount(struct super_block *sb, int *flags,
2012                         struct bch_opts opts)
2013 {
2014         struct bch_fs *c = sb->s_fs_info;
2015         int ret = 0;
2016
2017         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
2018
2019         if (opts.read_only != c->opts.read_only) {
2020                 down_write(&c->state_lock);
2021
2022                 if (opts.read_only) {
2023                         bch2_fs_read_only(c);
2024
2025                         sb->s_flags |= SB_RDONLY;
2026                 } else {
2027                         ret = bch2_fs_read_write(c);
2028                         if (ret) {
2029                                 bch_err(c, "error going rw: %i", ret);
2030                                 up_write(&c->state_lock);
2031                                 ret = -EINVAL;
2032                                 goto err;
2033                         }
2034
2035                         sb->s_flags &= ~SB_RDONLY;
2036                 }
2037
2038                 c->opts.read_only = opts.read_only;
2039
2040                 up_write(&c->state_lock);
2041         }
2042
2043         if (opt_defined(opts, errors))
2044                 c->opts.errors = opts.errors;
2045 err:
2046         return bch2_err_class(ret);
2047 }
2048
2049 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
2050 {
2051         struct bch_fs *c = root->d_sb->s_fs_info;
2052         bool first = true;
2053
2054         for_each_online_member(c, ca) {
2055                 if (!first)
2056                         seq_putc(seq, ':');
2057                 first = false;
2058                 seq_puts(seq, ca->disk_sb.sb_name);
2059         }
2060
2061         return 0;
2062 }
2063
2064 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
2065 {
2066         struct bch_fs *c = root->d_sb->s_fs_info;
2067         struct printbuf buf = PRINTBUF;
2068
2069         bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
2070                           OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
2071         printbuf_nul_terminate(&buf);
2072         seq_printf(seq, ",%s", buf.buf);
2073
2074         int ret = buf.allocation_failure ? -ENOMEM : 0;
2075         printbuf_exit(&buf);
2076         return ret;
2077 }
2078
2079 static void bch2_put_super(struct super_block *sb)
2080 {
2081         struct bch_fs *c = sb->s_fs_info;
2082
2083         __bch2_fs_stop(c);
2084 }
2085
2086 /*
2087  * bcachefs doesn't currently integrate intwrite freeze protection but the
2088  * internal write references serve the same purpose. Therefore reuse the
2089  * read-only transition code to perform the quiesce. The caveat is that we don't
2090  * currently have the ability to block tasks that want a write reference while
2091  * the superblock is frozen. This is fine for now, but we should either add
2092  * blocking support or find a way to integrate sb_start_intwrite() and friends.
2093  */
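/*
 * ->freeze_fs/->unfreeze_fs are reached via freeze_super()/thaw_super(), e.g.
 * from the FIFREEZE/FITHAW ioctls used by fsfreeze(8).
 */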
2094 static int bch2_freeze(struct super_block *sb)
2095 {
2096         struct bch_fs *c = sb->s_fs_info;
2097
2098         down_write(&c->state_lock);
2099         bch2_fs_read_only(c);
2100         up_write(&c->state_lock);
2101         return 0;
2102 }
2103
2104 static int bch2_unfreeze(struct super_block *sb)
2105 {
2106         struct bch_fs *c = sb->s_fs_info;
2107         int ret;
2108
2109         if (test_bit(BCH_FS_emergency_ro, &c->flags))
2110                 return 0;
2111
2112         down_write(&c->state_lock);
2113         ret = bch2_fs_read_write(c);
2114         up_write(&c->state_lock);
2115         return ret;
2116 }
2117
2118 static const struct super_operations bch_super_operations = {
2119         .alloc_inode    = bch2_alloc_inode,
2120         .free_inode     = bch2_free_inode,
2121         .write_inode    = bch2_vfs_write_inode,
2122         .evict_inode    = bch2_evict_inode,
2123         .sync_fs        = bch2_sync_fs,
2124         .statfs         = bch2_statfs,
2125         .show_devname   = bch2_show_devname,
2126         .show_options   = bch2_show_options,
2127         .put_super      = bch2_put_super,
2128         .freeze_fs      = bch2_freeze,
2129         .unfreeze_fs    = bch2_unfreeze,
2130 };
2131
2132 static int bch2_set_super(struct super_block *s, void *data)
2133 {
2134         s->s_fs_info = data;
2135         return 0;
2136 }
2137
2138 static int bch2_noset_super(struct super_block *s, void *data)
2139 {
2140         return -EBUSY;
2141 }
2142
2143 typedef DARRAY(struct bch_fs *) darray_fs;
2144
2145 static int bch2_test_super(struct super_block *s, void *data)
2146 {
2147         struct bch_fs *c = s->s_fs_info;
2148         darray_fs *d = data;
2149
2150         if (!c)
2151                 return false;
2152
2153         darray_for_each(*d, i)
2154                 if (c != *i)
2155                         return false;
2156         return true;
2157 }
2158
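/*
 * ->get_tree(): the mount path.  Split the "dev1:dev2:..." source string,
 * check via sget() whether those devices already belong to a mounted
 * superblock, and if not open and start a new bch_fs; options that need a
 * running filesystem (stashed in parse_later) are applied in between.
 */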
2159 static int bch2_fs_get_tree(struct fs_context *fc)
2160 {
2161         struct bch_fs *c;
2162         struct super_block *sb;
2163         struct inode *vinode;
2164         struct bch2_opts_parse *opts_parse = fc->fs_private;
2165         struct bch_opts opts = opts_parse->opts;
2166         darray_str devs;
2167         darray_fs devs_to_fs = {};
2168         int ret;
2169
2170         opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
2171         opt_set(opts, nostart, true);
2172
2173         if (!fc->source || strlen(fc->source) == 0)
2174                 return -EINVAL;
2175
2176         ret = bch2_split_devs(fc->source, &devs);
2177         if (ret)
2178                 return ret;
2179
2180         darray_for_each(devs, i) {
2181                 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
2182                 if (ret)
2183                         goto err;
2184         }
2185
2186         sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
2187         if (!IS_ERR(sb))
2188                 goto got_sb;
2189
2190         c = bch2_fs_open(devs.data, devs.nr, opts);
2191         ret = PTR_ERR_OR_ZERO(c);
2192         if (ret)
2193                 goto err;
2194
2195         /* Some options can't be parsed until after the fs is started: */
2196         opts = bch2_opts_empty();
2197         ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
2198         if (ret)
2199                 goto err_stop_fs;
2200
2201         bch2_opts_apply(&c->opts, opts);
2202
2203         ret = bch2_fs_start(c);
2204         if (ret)
2205                 goto err_stop_fs;
2206
2207         sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
2208         ret = PTR_ERR_OR_ZERO(sb);
2209         if (ret)
2210                 goto err_stop_fs;
2211 got_sb:
2212         c = sb->s_fs_info;
2213
2214         if (sb->s_root) {
2215                 if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
2216                         ret = -EBUSY;
2217                         goto err_put_super;
2218                 }
2219                 goto out;
2220         }
2221
2222         sb->s_blocksize         = block_bytes(c);
2223         sb->s_blocksize_bits    = ilog2(block_bytes(c));
2224         sb->s_maxbytes          = MAX_LFS_FILESIZE;
2225         sb->s_op                = &bch_super_operations;
2226         sb->s_export_op         = &bch_export_ops;
2227 #ifdef CONFIG_BCACHEFS_QUOTA
2228         sb->s_qcop              = &bch2_quotactl_operations;
2229         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
2230 #endif
2231         sb->s_xattr             = bch2_xattr_handlers;
2232         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
2233         sb->s_time_gran         = c->sb.nsec_per_time_unit;
2234         sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
2235         sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
2236         super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
2237         super_set_sysfs_name_uuid(sb);
2238         sb->s_shrink->seeks     = 0;
2239         c->vfs_sb               = sb;
2240         strscpy(sb->s_id, c->name, sizeof(sb->s_id));
2241
2242         ret = super_setup_bdi(sb);
2243         if (ret)
2244                 goto err_put_super;
2245
2246         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
2247
2248         for_each_online_member(c, ca) {
2249                 struct block_device *bdev = ca->disk_sb.bdev;
2250
2251                 /* XXX: create an anonymous device for multi device filesystems */
2252                 sb->s_bdev      = bdev;
2253                 sb->s_dev       = bdev->bd_dev;
2254                 percpu_ref_put(&ca->io_ref);
2255                 break;
2256         }
2257
2258         c->dev = sb->s_dev;
2259
2260 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2261         if (c->opts.acl)
2262                 sb->s_flags     |= SB_POSIXACL;
2263 #endif
2264
2267         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2268         ret = PTR_ERR_OR_ZERO(vinode);
2269         bch_err_msg(c, ret, "mounting: error getting root inode");
2270         if (ret)
2271                 goto err_put_super;
2272
2273         sb->s_root = d_make_root(vinode);
2274         if (!sb->s_root) {
2275                 bch_err(c, "error mounting: error allocating root dentry");
2276                 ret = -ENOMEM;
2277                 goto err_put_super;
2278         }
2279
2280         sb->s_flags |= SB_ACTIVE;
2281 out:
2282         fc->root = dget(sb->s_root);
2283 err:
2284         darray_exit(&devs_to_fs);
2285         bch2_darray_str_exit(&devs);
2286         if (ret)
2287                 pr_err("error: %s", bch2_err_str(ret));
2288         /*
2289          * On an inconsistency error in recovery we might see an -EROFS-derived
2290          * error code (from the journal), but we don't want to return that to
2291          * userspace, as that causes util-linux to retry the mount read-only -
2292          * which is confusing:
2293          */
2294         if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2295                 ret = -EIO;
2296         return bch2_err_class(ret);
2297
2298 err_stop_fs:
2299         bch2_fs_stop(c);
2300         goto err;
2301
2302 err_put_super:
2303         __bch2_fs_stop(c);
2304         deactivate_locked_super(sb);
2305         goto err;
2306 }
2307
2308 static void bch2_kill_sb(struct super_block *sb)
2309 {
2310         struct bch_fs *c = sb->s_fs_info;
2311
2312         generic_shutdown_super(sb);
2313         bch2_fs_free(c);
2314 }
2315
2316 static void bch2_fs_context_free(struct fs_context *fc)
2317 {
2318         struct bch2_opts_parse *opts = fc->fs_private;
2319
2320         if (opts) {
2321                 printbuf_exit(&opts->parse_later);
2322                 kfree(opts);
2323         }
2324 }
2325
2326 static int bch2_fs_parse_param(struct fs_context *fc,
2327                                struct fs_parameter *param)
2328 {
2329         /*
2330          * the "source" param, i.e., the name of the device(s) to mount,
2331          * is handled by the VFS layer.
2332          */
2333         if (!strcmp(param->key, "source"))
2334                 return -ENOPARAM;
2335
2336         struct bch2_opts_parse *opts = fc->fs_private;
2337         struct bch_fs *c = NULL;
2338
2339         /* for reconfigure, we already have a struct bch_fs */
2340         if (fc->root)
2341                 c = fc->root->d_sb->s_fs_info;
2342
2343         int ret = bch2_parse_one_mount_opt(c, &opts->opts,
2344                                            &opts->parse_later, param->key,
2345                                            param->string);
2346
2347         return bch2_err_class(ret);
2348 }
2349
2350 static int bch2_fs_reconfigure(struct fs_context *fc)
2351 {
2352         struct super_block *sb = fc->root->d_sb;
2353         struct bch2_opts_parse *opts = fc->fs_private;
2354
2355         return bch2_remount(sb, &fc->sb_flags, opts->opts);
2356 }
2357
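/*
 * New-style mount API wiring: the VFS calls ->parse_param for each mount
 * option, ->get_tree to actually open/mount the filesystem, and ->reconfigure
 * on remount; ->free releases the per-context option state.
 */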
2358 static const struct fs_context_operations bch2_context_ops = {
2359         .free        = bch2_fs_context_free,
2360         .parse_param = bch2_fs_parse_param,
2361         .get_tree    = bch2_fs_get_tree,
2362         .reconfigure = bch2_fs_reconfigure,
2363 };
2364
2365 static int bch2_init_fs_context(struct fs_context *fc)
2366 {
2367         struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
2368
2369         if (!opts)
2370                 return -ENOMEM;
2371
2372         opts->parse_later = PRINTBUF;
2373
2374         fc->ops = &bch2_context_ops;
2375         fc->fs_private = opts;
2376
2377         return 0;
2378 }
2379
2380 void bch2_fs_vfs_exit(struct bch_fs *c)
2381 {
2382         if (c->vfs_inodes_by_inum_table.ht.tbl)
2383                 rhltable_destroy(&c->vfs_inodes_by_inum_table);
2384         if (c->vfs_inodes_table.tbl)
2385                 rhashtable_destroy(&c->vfs_inodes_table);
2386 }
2387
2388 int bch2_fs_vfs_init(struct bch_fs *c)
2389 {
2390         return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
2391                 rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
2392 }
2393
2394 static struct file_system_type bcache_fs_type = {
2395         .owner                  = THIS_MODULE,
2396         .name                   = "bcachefs",
2397         .init_fs_context        = bch2_init_fs_context,
2398         .kill_sb                = bch2_kill_sb,
2399         .fs_flags               = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
2400 };
2401
2402 MODULE_ALIAS_FS("bcachefs");
2403
2404 void bch2_vfs_exit(void)
2405 {
2406         unregister_filesystem(&bcache_fs_type);
2407         kmem_cache_destroy(bch2_inode_cache);
2408 }
2409
2410 int __init bch2_vfs_init(void)
2411 {
2412         int ret = -ENOMEM;
2413
2414         bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
2415                                       SLAB_ACCOUNT);
2416         if (!bch2_inode_cache)
2417                 goto err;
2418
2419         ret = register_filesystem(&bcache_fs_type);
2420         if (ret)
2421                 goto err;
2422
2423         return 0;
2424 err:
2425         bch2_vfs_exit();
2426         return ret;
2427 }
2428
2429 #endif /* NO_BCACHEFS_FS */