fs/xfs/libxfs/xfs_rmap_btree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2014 Red Hat, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_trans.h"
  14 #include "xfs_alloc.h"
  15 #include "xfs_btree.h"
  16 #include "xfs_btree_staging.h"
  17 #include "xfs_rmap.h"
  18 #include "xfs_rmap_btree.h"
  19 #include "xfs_health.h"
  20 #include "xfs_trace.h"
  21 #include "xfs_error.h"
  22 #include "xfs_extent_busy.h"
  23 #include "xfs_ag.h"
  24 #include "xfs_ag_resv.h"
  25 #include "xfs_buf_mem.h"
  26 #include "xfs_btree_mem.h"
  27
/*
 * Slab cache used when allocating rmap btree cursors.  Created by
 * xfs_rmapbt_init_cur_cache() and released by xfs_rmapbt_destroy_cur_cache().
 */
static struct kmem_cache        *xfs_rmapbt_cur_cache;
  29
  30 /*
  31  * Reverse map btree.
  32  *
  33  * This is a per-ag tree used to track the owner(s) of a given extent. With
  34  * reflink it is possible for there to be multiple owners, which is a departure
  35  * from classic XFS. Owner records for data extents are inserted when the
  36  * extent is mapped and removed when an extent is unmapped.  Owner records for
  37  * all other block types (i.e. metadata) are inserted when an extent is
  38  * allocated and removed when an extent is freed. There can only be one owner
  39  * of a metadata extent, usually an inode or some other metadata structure like
  40  * an AG btree.
  41  *
  42  * The rmap btree is part of the free space management, so blocks for the tree
  43  * are sourced from the agfl. Hence we need transaction reservation support for
  44  * this tree so that the freelist is always large enough. This also impacts on
  45  * the minimum space we need to leave free in the AG.
  46  *
  47  * The tree is ordered by [ag block, owner, offset]. This is a large key size,
  48  * but it is the only way to enforce unique keys when a block can be owned by
  49  * multiple files at any offset. There's no need to order/search by extent
  50  * size for online updating/management of the tree. It is intended that most
  51  * reverse lookups will be to find the owner(s) of a particular block, or to
  52  * try to recover tree and file data from corrupt primary metadata.
  53  */
  54
  55 static struct xfs_btree_cur *
  56 xfs_rmapbt_dup_cursor(
  57         struct xfs_btree_cur    *cur)
  58 {
  59         return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
  60                                 cur->bc_ag.agbp, to_perag(cur->bc_group));
  61 }
  62
  63 STATIC void
  64 xfs_rmapbt_set_root(
  65         struct xfs_btree_cur            *cur,
  66         const union xfs_btree_ptr       *ptr,
  67         int                             inc)
  68 {
  69         struct xfs_buf                  *agbp = cur->bc_ag.agbp;
  70         struct xfs_agf                  *agf = agbp->b_addr;
  71         struct xfs_perag                *pag = to_perag(cur->bc_group);
  72
  73         ASSERT(ptr->s != 0);
  74
  75         agf->agf_rmap_root = ptr->s;
  76         be32_add_cpu(&agf->agf_rmap_level, inc);
  77         pag->pagf_rmap_level += inc;
  78
  79         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
  80 }
  81
  82 STATIC int
  83 xfs_rmapbt_alloc_block(
  84         struct xfs_btree_cur            *cur,
  85         const union xfs_btree_ptr       *start,
  86         union xfs_btree_ptr             *new,
  87         int                             *stat)
  88 {
  89         struct xfs_buf          *agbp = cur->bc_ag.agbp;
  90         struct xfs_agf          *agf = agbp->b_addr;
  91         struct xfs_perag        *pag = to_perag(cur->bc_group);
  92         struct xfs_alloc_arg    args = { .len = 1 };
  93         int                     error;
  94         xfs_agblock_t           bno;
  95
  96         /* Allocate the new block from the freelist. If we can't, give up.  */
  97         error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
  98                                        &bno, 1);
  99         if (error)
 100                 return error;
 101         if (bno == NULLAGBLOCK) {
 102                 *stat = 0;
 103                 return 0;
 104         }
 105
 106         xfs_extent_busy_reuse(pag_group(pag), bno, 1, false);
 107
 108         new->s = cpu_to_be32(bno);
 109         be32_add_cpu(&agf->agf_rmap_blocks, 1);
 110         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 111
 112         /*
 113          * Since rmapbt blocks are sourced from the AGFL, they are allocated one
 114          * at a time and the reservation updates don't require a transaction.
 115          */
 116         xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
 117
 118         *stat = 1;
 119         return 0;
 120 }
 121
/*
 * Give an rmap btree block back to the AG freelist.
 *
 * Decrements the AGF's rmap block count and logs that field, puts the block
 * on the freelist, marks the extent busy (with discard skipped), and finally
 * tells the RMAPBT reservation that one block was freed.  Returns zero, or
 * the error from xfs_alloc_put_freelist().
 */
STATIC int
xfs_rmapbt_free_block(
        struct xfs_btree_cur    *cur,
        struct xfs_buf          *bp)
{
        struct xfs_buf          *agbp = cur->bc_ag.agbp;
        struct xfs_agf          *agf = agbp->b_addr;
        struct xfs_perag        *pag = to_perag(cur->bc_group);
        xfs_agblock_t           bno;
        int                     error;

        /* Translate the buffer's disk address into an AG block number. */
        bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
        be32_add_cpu(&agf->agf_rmap_blocks, -1);
        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
        error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;

        xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1,
                              XFS_EXTENT_BUSY_SKIP_DISCARD);

        xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
        return 0;
}
 146
 147 STATIC int
 148 xfs_rmapbt_get_minrecs(
 149         struct xfs_btree_cur    *cur,
 150         int                     level)
 151 {
 152         return cur->bc_mp->m_rmap_mnr[level != 0];
 153 }
 154
 155 STATIC int
 156 xfs_rmapbt_get_maxrecs(
 157         struct xfs_btree_cur    *cur,
 158         int                     level)
 159 {
 160         return cur->bc_mp->m_rmap_mxr[level != 0];
 161 }
 162
 163 /*
 164  * Convert the ondisk record's offset field into the ondisk key's offset field.
 165  * Fork and bmbt are significant parts of the rmap record key, but written
 166  * status is merely a record attribute.
 167  */
 168 static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
 169 {
 170         return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
 171 }
 172
 173 STATIC void
 174 xfs_rmapbt_init_key_from_rec(
 175         union xfs_btree_key             *key,
 176         const union xfs_btree_rec       *rec)
 177 {
 178         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 179         key->rmap.rm_owner = rec->rmap.rm_owner;
 180         key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 181 }
 182
 183 /*
 184  * The high key for a reverse mapping record can be computed by shifting
 185  * the startblock and offset to the highest value that would still map
 186  * to that record.  In practice this means that we add blockcount-1 to
 187  * the startblock for all records, and if the record is for a data/attr
 188  * fork mapping, we add blockcount-1 to the offset too.
 189  */
 190 STATIC void
 191 xfs_rmapbt_init_high_key_from_rec(
 192         union xfs_btree_key             *key,
 193         const union xfs_btree_rec       *rec)
 194 {
 195         uint64_t                        off;
 196         int                             adj;
 197
 198         adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
 199
 200         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 201         be32_add_cpu(&key->rmap.rm_startblock, adj);
 202         key->rmap.rm_owner = rec->rmap.rm_owner;
 203         key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 204         if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
 205             XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
 206                 return;
 207         off = be64_to_cpu(key->rmap.rm_offset);
 208         off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
 209         key->rmap.rm_offset = cpu_to_be64(off);
 210 }
 211
 212 STATIC void
 213 xfs_rmapbt_init_rec_from_cur(
 214         struct xfs_btree_cur    *cur,
 215         union xfs_btree_rec     *rec)
 216 {
 217         rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
 218         rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
 219         rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
 220         rec->rmap.rm_offset = cpu_to_be64(
 221                         xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
 222 }
 223
 224 STATIC void
 225 xfs_rmapbt_init_ptr_from_cur(
 226         struct xfs_btree_cur    *cur,
 227         union xfs_btree_ptr     *ptr)
 228 {
 229         struct xfs_agf          *agf = cur->bc_ag.agbp->b_addr;
 230
 231         ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
 232
 233         ptr->s = agf->agf_rmap_root;
 234 }
 235
 236 /*
 237  * Mask the appropriate parts of the ondisk key field for a key comparison.
 238  * Fork and bmbt are significant parts of the rmap record key, but written
 239  * status is merely a record attribute.
 240  */
 241 static inline uint64_t offset_keymask(uint64_t offset)
 242 {
 243         return offset & ~XFS_RMAP_OFF_UNWRITTEN;
 244 }
 245
 246 STATIC int64_t
 247 xfs_rmapbt_key_diff(
 248         struct xfs_btree_cur            *cur,
 249         const union xfs_btree_key       *key)
 250 {
 251         struct xfs_rmap_irec            *rec = &cur->bc_rec.r;
 252         const struct xfs_rmap_key       *kp = &key->rmap;
 253         __u64                           x, y;
 254         int64_t                         d;
 255
 256         d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
 257         if (d)
 258                 return d;
 259
 260         x = be64_to_cpu(kp->rm_owner);
 261         y = rec->rm_owner;
 262         if (x > y)
 263                 return 1;
 264         else if (y > x)
 265                 return -1;
 266
 267         x = offset_keymask(be64_to_cpu(kp->rm_offset));
 268         y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
 269         if (x > y)
 270                 return 1;
 271         else if (y > x)
 272                 return -1;
 273         return 0;
 274 }
 275
 276 STATIC int64_t
 277 xfs_rmapbt_diff_two_keys(
 278         struct xfs_btree_cur            *cur,
 279         const union xfs_btree_key       *k1,
 280         const union xfs_btree_key       *k2,
 281         const union xfs_btree_key       *mask)
 282 {
 283         const struct xfs_rmap_key       *kp1 = &k1->rmap;
 284         const struct xfs_rmap_key       *kp2 = &k2->rmap;
 285         int64_t                         d;
 286         __u64                           x, y;
 287
 288         /* Doesn't make sense to mask off the physical space part */
 289         ASSERT(!mask || mask->rmap.rm_startblock);
 290
 291         d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
 292                      be32_to_cpu(kp2->rm_startblock);
 293         if (d)
 294                 return d;
 295
 296         if (!mask || mask->rmap.rm_owner) {
 297                 x = be64_to_cpu(kp1->rm_owner);
 298                 y = be64_to_cpu(kp2->rm_owner);
 299                 if (x > y)
 300                         return 1;
 301                 else if (y > x)
 302                         return -1;
 303         }
 304
 305         if (!mask || mask->rmap.rm_offset) {
 306                 /* Doesn't make sense to allow offset but not owner */
 307                 ASSERT(!mask || mask->rmap.rm_owner);
 308
 309                 x = offset_keymask(be64_to_cpu(kp1->rm_offset));
 310                 y = offset_keymask(be64_to_cpu(kp2->rm_offset));
 311                 if (x > y)
 312                         return 1;
 313                 else if (y > x)
 314                         return -1;
 315         }
 316
 317         return 0;
 318 }
 319
/*
 * Structure verifier for an ondisk rmap btree block.
 *
 * Returns NULL-equivalent (no failure address) when the block passes, or the
 * address of the check that failed.  Checks run in this order: magic number,
 * rmapbt feature enabled on the mount, v5 short-block header, tree level, and
 * finally the generic AG block checks against the per-level record limit.
 */
static xfs_failaddr_t
xfs_rmapbt_verify(
        struct xfs_buf          *bp)
{
        struct xfs_mount        *mp = bp->b_mount;
        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
        struct xfs_perag        *pag = bp->b_pag;
        xfs_failaddr_t          fa;
        unsigned int            level;

        /*
         * magic number and level verification
         *
         * During growfs operations, we can't verify the exact level or owner as
         * the perag is not fully initialised and hence not attached to the
         * buffer.  In this case, check against the maximum tree depth.
         *
         * Similarly, during log recovery we will have a perag structure
         * attached, but the agf information will not yet have been initialised
         * from the on disk AGF. Again, we can only check against maximum limits
         * in this case.
         */
        if (!xfs_verify_magic(bp, block->bb_magic))
                return __this_address;

        if (!xfs_has_rmapbt(mp))
                return __this_address;
        fa = xfs_btree_agblock_v5hdr_verify(bp);
        if (fa)
                return fa;

        level = be16_to_cpu(block->bb_level);
        if (pag && xfs_perag_initialised_agf(pag)) {
                unsigned int    maxlevel = pag->pagf_rmap_level;

#ifdef CONFIG_XFS_ONLINE_REPAIR
                /*
                 * Online repair could be rewriting the rmap btree, so we'll
                 * validate against the larger of the live tree height and the
                 * repair tree height while this is going on.
                 */
                maxlevel = max_t(unsigned int, maxlevel,
                                pag->pagf_repair_rmap_level);
#endif
                if (level >= maxlevel)
                        return __this_address;
        } else if (level >= mp->m_rmap_maxlevels)
                return __this_address;

        /* Level 0 blocks use slot 0 of the limits, all others slot 1. */
        return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
 371
 372 static void
 373 xfs_rmapbt_read_verify(
 374         struct xfs_buf  *bp)
 375 {
 376         xfs_failaddr_t  fa;
 377
 378         if (!xfs_btree_agblock_verify_crc(bp))
 379                 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 380         else {
 381                 fa = xfs_rmapbt_verify(bp);
 382                 if (fa)
 383                         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 384         }
 385
 386         if (bp->b_error)
 387                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 388 }
 389
 390 static void
 391 xfs_rmapbt_write_verify(
 392         struct xfs_buf  *bp)
 393 {
 394         xfs_failaddr_t  fa;
 395
 396         fa = xfs_rmapbt_verify(bp);
 397         if (fa) {
 398                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 399                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 400                 return;
 401         }
 402         xfs_btree_agblock_calc_crc(bp);
 403
 404 }
 405
/*
 * Buffer verifier operations for ondisk rmap btree blocks.
 *
 * Only the second magic slot is populated (with the CRC magic); the first is
 * zero.  NOTE(review): presumably slot 0 is for the non-CRC format, which has
 * no rmap btree -- confirm against xfs_verify_magic().
 */
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
        .name                   = "xfs_rmapbt",
        .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
        .verify_read            = xfs_rmapbt_read_verify,
        .verify_write           = xfs_rmapbt_write_verify,
        .verify_struct          = xfs_rmapbt_verify,
};
 413
 414 STATIC int
 415 xfs_rmapbt_keys_inorder(
 416         struct xfs_btree_cur            *cur,
 417         const union xfs_btree_key       *k1,
 418         const union xfs_btree_key       *k2)
 419 {
 420         uint32_t                x;
 421         uint32_t                y;
 422         uint64_t                a;
 423         uint64_t                b;
 424
 425         x = be32_to_cpu(k1->rmap.rm_startblock);
 426         y = be32_to_cpu(k2->rmap.rm_startblock);
 427         if (x < y)
 428                 return 1;
 429         else if (x > y)
 430                 return 0;
 431         a = be64_to_cpu(k1->rmap.rm_owner);
 432         b = be64_to_cpu(k2->rmap.rm_owner);
 433         if (a < b)
 434                 return 1;
 435         else if (a > b)
 436                 return 0;
 437         a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
 438         b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
 439         if (a <= b)
 440                 return 1;
 441         return 0;
 442 }
 443
 444 STATIC int
 445 xfs_rmapbt_recs_inorder(
 446         struct xfs_btree_cur            *cur,
 447         const union xfs_btree_rec       *r1,
 448         const union xfs_btree_rec       *r2)
 449 {
 450         uint32_t                x;
 451         uint32_t                y;
 452         uint64_t                a;
 453         uint64_t                b;
 454
 455         x = be32_to_cpu(r1->rmap.rm_startblock);
 456         y = be32_to_cpu(r2->rmap.rm_startblock);
 457         if (x < y)
 458                 return 1;
 459         else if (x > y)
 460                 return 0;
 461         a = be64_to_cpu(r1->rmap.rm_owner);
 462         b = be64_to_cpu(r2->rmap.rm_owner);
 463         if (a < b)
 464                 return 1;
 465         else if (a > b)
 466                 return 0;
 467         a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
 468         b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
 469         if (a <= b)
 470                 return 1;
 471         return 0;
 472 }
 473
 474 STATIC enum xbtree_key_contig
 475 xfs_rmapbt_keys_contiguous(
 476         struct xfs_btree_cur            *cur,
 477         const union xfs_btree_key       *key1,
 478         const union xfs_btree_key       *key2,
 479         const union xfs_btree_key       *mask)
 480 {
 481         ASSERT(!mask || mask->rmap.rm_startblock);
 482
 483         /*
 484          * We only support checking contiguity of the physical space component.
 485          * If any callers ever need more specificity than that, they'll have to
 486          * implement it here.
 487          */
 488         ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
 489
 490         return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
 491                                  be32_to_cpu(key2->rmap.rm_startblock));
 492 }
 493
/*
 * Btree operations for the per-AG reverse mapping btree: short pointers,
 * overlapping geometry, and block allocation/free routed through the
 * xfs_rmapbt_* callbacks defined in this file.
 */
const struct xfs_btree_ops xfs_rmapbt_ops = {
        .name                   = "rmap",
        .type                   = XFS_BTREE_TYPE_AG,
        .geom_flags             = XFS_BTGEO_OVERLAPPING,

        .rec_len                = sizeof(struct xfs_rmap_rec),
        /* Overlapping btree; 2 keys per pointer. */
        .key_len                = 2 * sizeof(struct xfs_rmap_key),
        .ptr_len                = XFS_BTREE_SHORT_PTR_LEN,

        .lru_refs               = XFS_RMAP_BTREE_REF,
        .statoff                = XFS_STATS_CALC_INDEX(xs_rmap_2),
        .sick_mask              = XFS_SICK_AG_RMAPBT,

        .dup_cursor             = xfs_rmapbt_dup_cursor,
        .set_root               = xfs_rmapbt_set_root,
        .alloc_block            = xfs_rmapbt_alloc_block,
        .free_block             = xfs_rmapbt_free_block,
        .get_minrecs            = xfs_rmapbt_get_minrecs,
        .get_maxrecs            = xfs_rmapbt_get_maxrecs,
        .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
        .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
        .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_rmapbt_init_ptr_from_cur,
        .key_diff               = xfs_rmapbt_key_diff,
        .buf_ops                = &xfs_rmapbt_buf_ops,
        .diff_two_keys          = xfs_rmapbt_diff_two_keys,
        .keys_inorder           = xfs_rmapbt_keys_inorder,
        .recs_inorder           = xfs_rmapbt_recs_inorder,
        .keys_contiguous        = xfs_rmapbt_keys_contiguous,
};
 525
 526 /*
 527  * Create a new reverse mapping btree cursor.
 528  *
 529  * For staging cursors tp and agbp are NULL.
 530  */
 531 struct xfs_btree_cur *
 532 xfs_rmapbt_init_cursor(
 533         struct xfs_mount        *mp,
 534         struct xfs_trans        *tp,
 535         struct xfs_buf          *agbp,
 536         struct xfs_perag        *pag)
 537 {
 538         struct xfs_btree_cur    *cur;
 539
 540         cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
 541                         mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
 542         cur->bc_group = xfs_group_hold(pag_group(pag));
 543         cur->bc_ag.agbp = agbp;
 544         if (agbp) {
 545                 struct xfs_agf          *agf = agbp->b_addr;
 546
 547                 cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
 548         }
 549         return cur;
 550 }
 551
 552 #ifdef CONFIG_XFS_BTREE_IN_MEM
 553 static inline unsigned int
 554 xfs_rmapbt_mem_block_maxrecs(
 555         unsigned int            blocklen,
 556         bool                    leaf)
 557 {
 558         if (leaf)
 559                 return blocklen / sizeof(struct xfs_rmap_rec);
 560         return blocklen /
 561                 (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
 562 }
 563
 564 /*
 565  * Validate an in-memory rmap btree block.  Callers are allowed to generate an
 566  * in-memory btree even if the ondisk feature is not enabled.
 567  */
 568 static xfs_failaddr_t
 569 xfs_rmapbt_mem_verify(
 570         struct xfs_buf          *bp)
 571 {
 572         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 573         xfs_failaddr_t          fa;
 574         unsigned int            level;
 575         unsigned int            maxrecs;
 576
 577         if (!xfs_verify_magic(bp, block->bb_magic))
 578                 return __this_address;
 579
 580         fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
 581         if (fa)
 582                 return fa;
 583
 584         level = be16_to_cpu(block->bb_level);
 585         if (level >= xfs_rmapbt_maxlevels_ondisk())
 586                 return __this_address;
 587
 588         maxrecs = xfs_rmapbt_mem_block_maxrecs(
 589                         XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
 590         return xfs_btree_memblock_verify(bp, maxrecs);
 591 }
 592
 593 static void
 594 xfs_rmapbt_mem_rw_verify(
 595         struct xfs_buf  *bp)
 596 {
 597         xfs_failaddr_t  fa = xfs_rmapbt_mem_verify(bp);
 598
 599         if (fa)
 600                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 601 }
 602
/*
 * Buffer verifier operations for in-memory rmap btree blocks.  The same
 * structure-only verifier serves both reads and writes; CRC checks are
 * skipped on in-memory btrees to save time.
 */
static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
        .name                   = "xfs_rmapbt_mem",
        .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
        .verify_read            = xfs_rmapbt_mem_rw_verify,
        .verify_write           = xfs_rmapbt_mem_rw_verify,
        .verify_struct          = xfs_rmapbt_mem_verify,
};
 611
/*
 * Btree operations for the in-memory reverse mapping btree.  Record and key
 * handling is shared with the ondisk rmap btree; cursor duplication, root
 * updates, block allocation/free and record limits go through the generic
 * xfbtree_* helpers, and pointers are long format.
 */
const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
        .name                   = "mem_rmap",
        .type                   = XFS_BTREE_TYPE_MEM,
        .geom_flags             = XFS_BTGEO_OVERLAPPING,

        .rec_len                = sizeof(struct xfs_rmap_rec),
        /* Overlapping btree; 2 keys per pointer. */
        .key_len                = 2 * sizeof(struct xfs_rmap_key),
        .ptr_len                = XFS_BTREE_LONG_PTR_LEN,

        .lru_refs               = XFS_RMAP_BTREE_REF,
        .statoff                = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),

        .dup_cursor             = xfbtree_dup_cursor,
        .set_root               = xfbtree_set_root,
        .alloc_block            = xfbtree_alloc_block,
        .free_block             = xfbtree_free_block,
        .get_minrecs            = xfbtree_get_minrecs,
        .get_maxrecs            = xfbtree_get_maxrecs,
        .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
        .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
        .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfbtree_init_ptr_from_cur,
        .key_diff               = xfs_rmapbt_key_diff,
        .buf_ops                = &xfs_rmapbt_mem_buf_ops,
        .diff_two_keys          = xfs_rmapbt_diff_two_keys,
        .keys_inorder           = xfs_rmapbt_keys_inorder,
        .recs_inorder           = xfs_rmapbt_recs_inorder,
        .keys_contiguous        = xfs_rmapbt_keys_contiguous,
};
 642
 643 /* Create a cursor for an in-memory btree. */
 644 struct xfs_btree_cur *
 645 xfs_rmapbt_mem_cursor(
 646         struct xfs_perag        *pag,
 647         struct xfs_trans        *tp,
 648         struct xfbtree          *xfbt)
 649 {
 650         struct xfs_btree_cur    *cur;
 651
 652         cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops,
 653                         xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
 654         cur->bc_mem.xfbtree = xfbt;
 655         cur->bc_nlevels = xfbt->nlevels;
 656
 657         cur->bc_group = xfs_group_hold(pag_group(pag));
 658         return cur;
 659 }
 660
/*
 * Create an in-memory rmap btree.
 *
 * Records @agno as the owner of @xfbt, then initializes the xfbtree on @btp
 * with the in-memory rmap btree operations.  Returns whatever xfbtree_init()
 * returns.
 */
int
xfs_rmapbt_mem_init(
        struct xfs_mount        *mp,
        struct xfbtree          *xfbt,
        struct xfs_buftarg      *btp,
        xfs_agnumber_t          agno)
{
        xfbt->owner = agno;
        return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
}
 672
 673 /* Compute the max possible height for reverse mapping btrees in memory. */
 674 static unsigned int
 675 xfs_rmapbt_mem_maxlevels(void)
 676 {
 677         unsigned int            minrecs[2];
 678         unsigned int            blocklen;
 679
 680         blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
 681
 682         minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
 683         minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
 684
 685         /*
 686          * How tall can an in-memory rmap btree become if we filled the entire
 687          * AG with rmap records?
 688          */
 689         return xfs_btree_compute_maxlevels(minrecs,
 690                         XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
 691 }
 692 #else
/* Without CONFIG_XFS_BTREE_IN_MEM the in-memory height limit is zero. */
# define xfs_rmapbt_mem_maxlevels()     (0)
 694 #endif /* CONFIG_XFS_BTREE_IN_MEM */
 695
 696 /*
 697  * Install a new reverse mapping btree root.  Caller is responsible for
 698  * invalidating and freeing the old btree blocks.
 699  */
 700 void
 701 xfs_rmapbt_commit_staged_btree(
 702         struct xfs_btree_cur    *cur,
 703         struct xfs_trans        *tp,
 704         struct xfs_buf          *agbp)
 705 {
 706         struct xfs_agf          *agf = agbp->b_addr;
 707         struct xbtree_afakeroot *afake = cur->bc_ag.afake;
 708
 709         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 710
 711         agf->agf_rmap_root = cpu_to_be32(afake->af_root);
 712         agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
 713         agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
 714         xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
 715                                     XFS_AGF_RMAP_BLOCKS);
 716         xfs_btree_commit_afakeroot(cur, tp, agbp);
 717 }
 718
 719 /* Calculate number of records in a reverse mapping btree block. */
 720 static inline unsigned int
 721 xfs_rmapbt_block_maxrecs(
 722         unsigned int            blocklen,
 723         bool                    leaf)
 724 {
 725         if (leaf)
 726                 return blocklen / sizeof(struct xfs_rmap_rec);
 727         return blocklen /
 728                 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
 729 }
 730
 731 /*
 732  * Calculate number of records in an rmap btree block.
 733  */
 734 unsigned int
 735 xfs_rmapbt_maxrecs(
 736         struct xfs_mount        *mp,
 737         unsigned int            blocklen,
 738         bool                    leaf)
 739 {
 740         blocklen -= XFS_RMAP_BLOCK_LEN;
 741         return xfs_rmapbt_block_maxrecs(blocklen, leaf);
 742 }
 743
 744 /* Compute the max possible height for reverse mapping btrees. */
 745 unsigned int
 746 xfs_rmapbt_maxlevels_ondisk(void)
 747 {
 748         unsigned int            minrecs[2];
 749         unsigned int            blocklen;
 750
 751         blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
 752
 753         minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
 754         minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
 755
 756         /*
 757          * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
 758          *
 759          * On a reflink filesystem, each AG block can have up to 2^32 (per the
 760          * refcount record format) owners, which means that theoretically we
 761          * could face up to 2^64 rmap records.  However, we're likely to run
 762          * out of blocks in the AG long before that happens, which means that
 763          * we must compute the max height based on what the btree will look
 764          * like if it consumes almost all the blocks in the AG due to maximal
 765          * sharing factor.
 766          */
 767         return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
 768                    xfs_rmapbt_mem_maxlevels());
 769 }
 770
 771 /* Compute the maximum height of an rmap btree. */
 772 void
 773 xfs_rmapbt_compute_maxlevels(
 774         struct xfs_mount                *mp)
 775 {
 776         if (!xfs_has_rmapbt(mp)) {
 777                 mp->m_rmap_maxlevels = 0;
 778                 return;
 779         }
 780
 781         if (xfs_has_reflink(mp)) {
 782                 /*
 783                  * Compute the asymptotic maxlevels for an rmap btree on a
 784                  * filesystem that supports reflink.
 785                  *
 786                  * On a reflink filesystem, each AG block can have up to 2^32
 787                  * (per the refcount record format) owners, which means that
 788                  * theoretically we could face up to 2^64 rmap records.
 789                  * However, we're likely to run out of blocks in the AG long
 790                  * before that happens, which means that we must compute the
 791                  * max height based on what the btree will look like if it
 792                  * consumes almost all the blocks in the AG due to maximal
 793                  * sharing factor.
 794                  */
 795                 mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
 796                                 mp->m_sb.sb_agblocks);
 797         } else {
 798                 /*
 799                  * If there's no block sharing, compute the maximum rmapbt
 800                  * height assuming one rmap record per AG block.
 801                  */
 802                 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
 803                                 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
 804         }
 805         ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
 806 }
 807
/*
 * Calculate the rmap btree size for @len records, based on this mount's
 * minimum records-per-block limits (m_rmap_mnr).
 */
xfs_extlen_t
xfs_rmapbt_calc_size(
        struct xfs_mount        *mp,
        unsigned long long      len)
{
        return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}
 816
 817 /*
 818  * Calculate the maximum refcount btree size.
 819  */
 820 xfs_extlen_t
 821 xfs_rmapbt_max_size(
 822         struct xfs_mount        *mp,
 823         xfs_agblock_t           agblocks)
 824 {
 825         /* Bail out if we're uninitialized, which can happen in mkfs. */
 826         if (mp->m_rmap_mxr[0] == 0)
 827                 return 0;
 828
 829         return xfs_rmapbt_calc_size(mp, agblocks);
 830 }
 831
 832 /*
 833  * Figure out how many blocks to reserve and how many are used by this btree.
 834  */
 835 int
 836 xfs_rmapbt_calc_reserves(
 837         struct xfs_mount        *mp,
 838         struct xfs_trans        *tp,
 839         struct xfs_perag        *pag,
 840         xfs_extlen_t            *ask,
 841         xfs_extlen_t            *used)
 842 {
 843         struct xfs_buf          *agbp;
 844         struct xfs_agf          *agf;
 845         xfs_agblock_t           agblocks;
 846         xfs_extlen_t            tree_len;
 847         int                     error;
 848
 849         if (!xfs_has_rmapbt(mp))
 850                 return 0;
 851
 852         error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 853         if (error)
 854                 return error;
 855
 856         agf = agbp->b_addr;
 857         agblocks = be32_to_cpu(agf->agf_length);
 858         tree_len = be32_to_cpu(agf->agf_rmap_blocks);
 859         xfs_trans_brelse(tp, agbp);
 860
 861         /*
 862          * The log is permanently allocated, so the space it occupies will
 863          * never be available for the kinds of things that would require btree
 864          * expansion.  We therefore can pretend the space isn't there.
 865          */
 866         if (xfs_ag_contains_log(mp, pag_agno(pag)))
 867                 agblocks -= mp->m_sb.sb_logblocks;
 868
 869         /* Reserve 1% of the AG or enough for 1 block per record. */
 870         *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
 871         *used += tree_len;
 872
 873         return error;
 874 }
 875
 876 int __init
 877 xfs_rmapbt_init_cur_cache(void)
 878 {
 879         xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
 880                         xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
 881                         0, 0, NULL);
 882
 883         if (!xfs_rmapbt_cur_cache)
 884                 return -ENOMEM;
 885         return 0;
 886 }
 887
/*
 * Destroy the rmap btree cursor cache and clear the pointer so that no stale
 * reference to it is left behind.
 */
void
xfs_rmapbt_destroy_cur_cache(void)
{
        kmem_cache_destroy(xfs_rmapbt_cur_cache);
        xfs_rmapbt_cur_cache = NULL;
}
 893 }