1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_btree.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_alloc.h"
20 #include "xfs_extent_busy.h"
21 #include "xfs_errortag.h"
22 #include "xfs_error.h"
23 #include "xfs_trace.h"
24 #include "xfs_trans.h"
25 #include "xfs_buf_item.h"
27 #include "xfs_ag_resv.h"
30 extern kmem_zone_t *xfs_bmap_free_item_zone;
32 struct workqueue_struct *xfs_alloc_wq;
34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 #define XFSA_FIXUP_BNO_OK 1
37 #define XFSA_FIXUP_CNT_OK 2
39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
44 * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
45 * the beginning of the block for a proper header with the location information
52 unsigned int size = mp->m_sb.sb_sectsize;
54 if (xfs_sb_version_hascrc(&mp->m_sb))
55 size -= sizeof(struct xfs_agfl);
57 return size / sizeof(xfs_agblock_t);
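/*
 * A worked example, assuming the common 512 byte sector size and a 36 byte
 * struct xfs_agfl header on v5 (CRC-enabled) filesystems:
 *
 *	v4:	512 / sizeof(xfs_agblock_t)		= 128 slots
 *	v5:	(512 - 36) / sizeof(xfs_agblock_t)	= 119 slots
 */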
64 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
65 return XFS_RMAP_BLOCK(mp) + 1;
66 if (xfs_sb_version_hasfinobt(&mp->m_sb))
67 return XFS_FIBT_BLOCK(mp) + 1;
68 return XFS_IBT_BLOCK(mp) + 1;
75 if (xfs_sb_version_hasreflink(&mp->m_sb))
76 return xfs_refc_block(mp) + 1;
77 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
78 return XFS_RMAP_BLOCK(mp) + 1;
79 if (xfs_sb_version_hasfinobt(&mp->m_sb))
80 return XFS_FIBT_BLOCK(mp) + 1;
81 return XFS_IBT_BLOCK(mp) + 1;
85 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
86 * AGF buffer (PV 947395), we place constraints on the relationship among
87 * actual allocations for data blocks, freelist blocks, and potential file data
88 * bmap btree blocks. However, these restrictions may result in no actual space
89 * allocated for a delayed extent, for example, a data block in a certain AG is
90 * allocated but there is no additional block for the additional bmap btree
91 * block due to a split of the bmap btree of the file. The result of this may
92 * lead to an infinite loop when the file gets flushed to disk and all delayed
93 * extents need to be actually allocated. To get around this, we explicitly set
94 * aside a few blocks which will not be reserved in delayed allocation.
96 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
97 * potential split of the file's bmap btree.
101 struct xfs_mount *mp)
103 return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
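/*
 * For example, a hypothetical filesystem with 4 AGs and an
 * XFS_ALLOC_AGFL_RESERVE of 4 sets aside 4 * (4 + 4) = 32 blocks from the
 * space visible to delayed allocation.
 */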
107 * When deciding how much space to allocate out of an AG, we limit the
108 * allocation maximum size to the size of the AG. However, we cannot use all the
109 * blocks in the AG - some are permanently used by metadata. These
110 * blocks are generally:
111 * - the AG superblock, AGF, AGI and AGFL
112 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
113 * the AGI free inode and rmap btree root blocks.
114 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
115 * - the rmapbt root block
117 * The AG headers are sector sized, so the amount of space they take up is
118 * dependent on filesystem geometry. The others are all single blocks.
121 xfs_alloc_ag_max_usable(
122 struct xfs_mount *mp)
126 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
127 blocks += XFS_ALLOC_AGFL_RESERVE;
128 blocks += 3; /* AGF, AGI btree root blocks */
129 if (xfs_sb_version_hasfinobt(&mp->m_sb))
130 blocks++; /* finobt root block */
131 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
132 blocks++; /* rmap root block */
133 if (xfs_sb_version_hasreflink(&mp->m_sb))
134 blocks++; /* refcount root block */
136 return mp->m_sb.sb_agblocks - blocks;
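/*
 * Rough sketch of the arithmetic, assuming 4k blocks and 512 byte sectors so
 * that the four sector-sized AG headers round up to a single fsb: with the
 * finobt, rmapbt and reflink features all enabled, each AG loses
 * 1 + XFS_ALLOC_AGFL_RESERVE + 3 + 1 + 1 + 1 blocks of usable space here.
 */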
140 * Lookup the record equal to [bno, len] in the btree given by cur.
142 STATIC int /* error */
144 struct xfs_btree_cur *cur, /* btree cursor */
145 xfs_agblock_t bno, /* starting block of extent */
146 xfs_extlen_t len, /* length of extent */
147 int *stat) /* success/failure */
151 cur->bc_rec.a.ar_startblock = bno;
152 cur->bc_rec.a.ar_blockcount = len;
153 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
154 cur->bc_private.a.priv.abt.active = (*stat == 1);
159 * Lookup the first record greater than or equal to [bno, len]
160 * in the btree given by cur.
164 struct xfs_btree_cur *cur, /* btree cursor */
165 xfs_agblock_t bno, /* starting block of extent */
166 xfs_extlen_t len, /* length of extent */
167 int *stat) /* success/failure */
171 cur->bc_rec.a.ar_startblock = bno;
172 cur->bc_rec.a.ar_blockcount = len;
173 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
174 cur->bc_private.a.priv.abt.active = (*stat == 1);
179 * Lookup the first record less than or equal to [bno, len]
180 * in the btree given by cur.
184 struct xfs_btree_cur *cur, /* btree cursor */
185 xfs_agblock_t bno, /* starting block of extent */
186 xfs_extlen_t len, /* length of extent */
187 int *stat) /* success/failure */
190 cur->bc_rec.a.ar_startblock = bno;
191 cur->bc_rec.a.ar_blockcount = len;
192 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
193 cur->bc_private.a.priv.abt.active = (*stat == 1);
198 xfs_alloc_cur_active(
199 struct xfs_btree_cur *cur)
201 return cur && cur->bc_private.a.priv.abt.active;
205 * Update the record referred to by cur to the value given
207 * This either works (return 0) or gets an EFSCORRUPTED error.
209 STATIC int /* error */
211 struct xfs_btree_cur *cur, /* btree cursor */
212 xfs_agblock_t bno, /* starting block of extent */
213 xfs_extlen_t len) /* length of extent */
215 union xfs_btree_rec rec;
217 rec.alloc.ar_startblock = cpu_to_be32(bno);
218 rec.alloc.ar_blockcount = cpu_to_be32(len);
219 return xfs_btree_update(cur, &rec);
223 * Get the data from the pointed-to record.
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t *bno, /* output: starting block of extent */
229 xfs_extlen_t *len, /* output: length of extent */
230 int *stat) /* output: success/failure */
232 struct xfs_mount *mp = cur->bc_mp;
233 xfs_agnumber_t agno = cur->bc_private.a.agno;
234 union xfs_btree_rec *rec;
237 error = xfs_btree_get_rec(cur, &rec, stat);
238 if (error || !(*stat))
241 *bno = be32_to_cpu(rec->alloc.ar_startblock);
242 *len = be32_to_cpu(rec->alloc.ar_blockcount);
247 /* check for valid extent range, including overflow */
248 if (!xfs_verify_agbno(mp, agno, *bno))
250 if (*bno > *bno + *len)
252 if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
259 "%s Freespace BTree record corruption in AG %d detected!",
260 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
262 "start block 0x%x block count 0x%x", *bno, *len);
263 return -EFSCORRUPTED;
267 * Compute aligned version of the found extent.
268 * Takes alignment and min length into account.
271 xfs_alloc_compute_aligned(
272 xfs_alloc_arg_t *args, /* allocation argument structure */
273 xfs_agblock_t foundbno, /* starting block in found extent */
274 xfs_extlen_t foundlen, /* length in found extent */
275 xfs_agblock_t *resbno, /* result block number */
276 xfs_extlen_t *reslen, /* result length */
279 xfs_agblock_t bno = foundbno;
280 xfs_extlen_t len = foundlen;
284 /* Trim busy sections out of found extent */
285 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
288 * If we have a largish extent that happens to start before min_agbno,
289 * see if we can shift it into range...
291 if (bno < args->min_agbno && bno + len > args->min_agbno) {
292 diff = args->min_agbno - bno;
299 if (args->alignment > 1 && len >= args->minlen) {
300 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
302 diff = aligned_bno - bno;
304 *resbno = aligned_bno;
305 *reslen = diff >= len ? 0 : len - diff;
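/*
 * Example with hypothetical numbers: a found extent [bno 37, len 20] and
 * args->alignment == 8 rounds up to aligned_bno 40, so diff == 3 and the
 * aligned result is [40, 17]; if diff had consumed the whole extent the
 * result length would be clamped to 0.
 */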
315 * Compute best start block and diff for "near" allocations.
316 * freelen >= wantlen already checked by caller.
318 STATIC xfs_extlen_t /* difference value (absolute) */
319 xfs_alloc_compute_diff(
320 xfs_agblock_t wantbno, /* target starting block */
321 xfs_extlen_t wantlen, /* target length */
322 xfs_extlen_t alignment, /* target alignment */
323 int datatype, /* are we allocating data? */
324 xfs_agblock_t freebno, /* freespace's starting block */
325 xfs_extlen_t freelen, /* freespace's length */
326 xfs_agblock_t *newbnop) /* result: best start block from free */
328 xfs_agblock_t freeend; /* end of freespace extent */
329 xfs_agblock_t newbno1; /* return block number */
330 xfs_agblock_t newbno2; /* other new block number */
331 xfs_extlen_t newlen1=0; /* length with newbno1 */
332 xfs_extlen_t newlen2=0; /* length with newbno2 */
333 xfs_agblock_t wantend; /* end of target extent */
334 bool userdata = datatype & XFS_ALLOC_USERDATA;
336 ASSERT(freelen >= wantlen);
337 freeend = freebno + freelen;
338 wantend = wantbno + wantlen;
340 * We want to allocate from the start of a free extent if it is past
341 * the desired block or if we are allocating user data and the free
342 * extent is before desired block. The second case is there to allow
343 * for contiguous allocation from the remaining free space if the file
344 * grows in the short term.
346 if (freebno >= wantbno || (userdata && freeend < wantend)) {
347 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
348 newbno1 = NULLAGBLOCK;
349 } else if (freeend >= wantend && alignment > 1) {
350 newbno1 = roundup(wantbno, alignment);
351 newbno2 = newbno1 - alignment;
352 if (newbno1 >= freeend)
353 newbno1 = NULLAGBLOCK;
355 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
356 if (newbno2 < freebno)
357 newbno2 = NULLAGBLOCK;
359 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
360 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
361 if (newlen1 < newlen2 ||
362 (newlen1 == newlen2 &&
363 XFS_ABSDIFF(newbno1, wantbno) >
364 XFS_ABSDIFF(newbno2, wantbno)))
366 } else if (newbno2 != NULLAGBLOCK)
368 } else if (freeend >= wantend) {
370 } else if (alignment > 1) {
371 newbno1 = roundup(freeend - wantlen, alignment);
372 if (newbno1 > freeend - wantlen &&
373 newbno1 - alignment >= freebno)
374 newbno1 -= alignment;
375 else if (newbno1 >= freeend)
376 newbno1 = NULLAGBLOCK;
378 newbno1 = freeend - wantlen;
380 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
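/*
 * Example with made-up numbers and alignment == 1: for wantbno 100,
 * wantlen 8 and a free extent [90, 40], freeend (130) covers wantend (108),
 * so a non-userdata allocation takes newbno1 = wantbno = 100 and returns
 * diff 0; a free extent starting at 104 instead would return newbno1 = 104
 * with diff 4.
 */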
384 * Fix up the length, based on mod and prod.
385 * len should be k * prod + mod for some k.
386 * If len is too small it is returned unchanged.
387 * If len hits maxlen it is left alone.
391 xfs_alloc_arg_t *args) /* allocation argument structure */
396 ASSERT(args->mod < args->prod);
398 ASSERT(rlen >= args->minlen);
399 ASSERT(rlen <= args->maxlen);
400 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
401 (args->mod == 0 && rlen < args->prod))
403 k = rlen % args->prod;
407 rlen = rlen - (k - args->mod);
409 rlen = rlen - args->prod + (args->mod - k);
410 /* casts to (int) catch length underflows */
411 if ((int)rlen < (int)args->minlen)
413 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
414 ASSERT(rlen % args->prod == args->mod);
415 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
416 rlen + args->minleft);
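/*
 * For example (hypothetical values): with prod == 4, mod == 1 and an
 * incoming rlen of 11, k == 3 > mod, so rlen is trimmed by k - mod == 2
 * down to 9, which again satisfies rlen == 2 * prod + mod.
 */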
421 * Update the two btrees, logically removing from freespace the extent
422 * starting at rbno, rlen blocks. The extent is contained within the
423 * actual (current) free extent fbno for flen blocks.
424 * Flags are passed in indicating whether the cursors are set to the
427 STATIC int /* error code */
428 xfs_alloc_fixup_trees(
429 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
430 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
431 xfs_agblock_t fbno, /* starting block of free extent */
432 xfs_extlen_t flen, /* length of free extent */
433 xfs_agblock_t rbno, /* starting block of returned extent */
434 xfs_extlen_t rlen, /* length of returned extent */
435 int flags) /* flags, XFSA_FIXUP_... */
437 int error; /* error code */
438 int i; /* operation results */
439 xfs_agblock_t nfbno1; /* first new free startblock */
440 xfs_agblock_t nfbno2; /* second new free startblock */
441 xfs_extlen_t nflen1=0; /* first new free length */
442 xfs_extlen_t nflen2=0; /* second new free length */
443 struct xfs_mount *mp;
448 * Look up the record in the by-size tree if necessary.
450 if (flags & XFSA_FIXUP_CNT_OK) {
452 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
454 if (XFS_IS_CORRUPT(mp,
458 return -EFSCORRUPTED;
461 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
463 if (XFS_IS_CORRUPT(mp, i != 1))
464 return -EFSCORRUPTED;
467 * Look up the record in the by-block tree if necessary.
469 if (flags & XFSA_FIXUP_BNO_OK) {
471 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
473 if (XFS_IS_CORRUPT(mp,
477 return -EFSCORRUPTED;
480 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
482 if (XFS_IS_CORRUPT(mp, i != 1))
483 return -EFSCORRUPTED;
487 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
488 struct xfs_btree_block *bnoblock;
489 struct xfs_btree_block *cntblock;
491 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
492 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
494 if (XFS_IS_CORRUPT(mp,
495 bnoblock->bb_numrecs !=
496 cntblock->bb_numrecs))
497 return -EFSCORRUPTED;
502 * Deal with all four cases: the allocated record is contained
503 * within the freespace record, so we can have new freespace
504 * at either (or both) end, or no freespace remaining.
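/*
 * Schematically, with free extent [fbno, fbno + flen) and allocated extent
 * [rbno, rbno + rlen), the four cases handled below are:
 *
 *	exact match		-> no freespace left (nfbno1 = nfbno2 = NULL)
 *	match at the start	-> one free extent after the allocation
 *	match at the end	-> one free extent before the allocation
 *	middle of the record	-> one free extent on each side
 */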
506 if (rbno == fbno && rlen == flen)
507 nfbno1 = nfbno2 = NULLAGBLOCK;
508 else if (rbno == fbno) {
509 nfbno1 = rbno + rlen;
510 nflen1 = flen - rlen;
511 nfbno2 = NULLAGBLOCK;
512 } else if (rbno + rlen == fbno + flen) {
514 nflen1 = flen - rlen;
515 nfbno2 = NULLAGBLOCK;
518 nflen1 = rbno - fbno;
519 nfbno2 = rbno + rlen;
520 nflen2 = (fbno + flen) - nfbno2;
523 * Delete the entry from the by-size btree.
525 if ((error = xfs_btree_delete(cnt_cur, &i)))
527 if (XFS_IS_CORRUPT(mp, i != 1))
528 return -EFSCORRUPTED;
530 * Add new by-size btree entry(s).
532 if (nfbno1 != NULLAGBLOCK) {
533 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
535 if (XFS_IS_CORRUPT(mp, i != 0))
536 return -EFSCORRUPTED;
537 if ((error = xfs_btree_insert(cnt_cur, &i)))
539 if (XFS_IS_CORRUPT(mp, i != 1))
540 return -EFSCORRUPTED;
542 if (nfbno2 != NULLAGBLOCK) {
543 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
545 if (XFS_IS_CORRUPT(mp, i != 0))
546 return -EFSCORRUPTED;
547 if ((error = xfs_btree_insert(cnt_cur, &i)))
549 if (XFS_IS_CORRUPT(mp, i != 1))
550 return -EFSCORRUPTED;
553 * Fix up the by-block btree entry(s).
555 if (nfbno1 == NULLAGBLOCK) {
557 * No remaining freespace, just delete the by-block tree entry.
559 if ((error = xfs_btree_delete(bno_cur, &i)))
561 if (XFS_IS_CORRUPT(mp, i != 1))
562 return -EFSCORRUPTED;
565 * Update the by-block entry to start later|be shorter.
567 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
570 if (nfbno2 != NULLAGBLOCK) {
572 * 2 resulting free entries, need to add one.
574 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
576 if (XFS_IS_CORRUPT(mp, i != 0))
577 return -EFSCORRUPTED;
578 if ((error = xfs_btree_insert(bno_cur, &i)))
580 if (XFS_IS_CORRUPT(mp, i != 1))
581 return -EFSCORRUPTED;
586 static xfs_failaddr_t
590 struct xfs_mount *mp = bp->b_mount;
591 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
595 * There is no verification of non-crc AGFLs because mkfs does not
596 * initialise the AGFL to zero or NULL. Hence the only valid part of the
597 * AGFL is what the AGF says is active. We can't get to the AGF, so we
598 * can't verify just those entries are valid.
600 if (!xfs_sb_version_hascrc(&mp->m_sb))
603 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
604 return __this_address;
605 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
606 return __this_address;
608 * during growfs operations, the perag is not fully initialised,
609 * so we can't use it for any useful checking. growfs ensures we can't
610 * use it by using uncached buffers that don't have the perag attached
611 * so we can detect and avoid this problem.
613 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
614 return __this_address;
616 for (i = 0; i < xfs_agfl_size(mp); i++) {
617 if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
618 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
619 return __this_address;
622 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
623 return __this_address;
628 xfs_agfl_read_verify(
631 struct xfs_mount *mp = bp->b_mount;
635 * There is no verification of non-crc AGFLs because mkfs does not
636 * initialise the AGFL to zero or NULL. Hence the only valid part of the
637 * AGFL is what the AGF says is active. We can't get to the AGF, so we
638 * can't verify just those entries are valid.
640 if (!xfs_sb_version_hascrc(&mp->m_sb))
643 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
644 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
646 fa = xfs_agfl_verify(bp);
648 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
653 xfs_agfl_write_verify(
656 struct xfs_mount *mp = bp->b_mount;
657 struct xfs_buf_log_item *bip = bp->b_log_item;
660 /* no verification of non-crc AGFLs */
661 if (!xfs_sb_version_hascrc(&mp->m_sb))
664 fa = xfs_agfl_verify(bp);
666 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
671 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
673 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
676 const struct xfs_buf_ops xfs_agfl_buf_ops = {
678 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
679 .verify_read = xfs_agfl_read_verify,
680 .verify_write = xfs_agfl_write_verify,
681 .verify_struct = xfs_agfl_verify,
685 * Read in the allocation group free block array.
689 xfs_mount_t *mp, /* mount point structure */
690 xfs_trans_t *tp, /* transaction pointer */
691 xfs_agnumber_t agno, /* allocation group number */
692 xfs_buf_t **bpp) /* buffer for the ag free block array */
694 xfs_buf_t *bp; /* return value */
697 ASSERT(agno != NULLAGNUMBER);
698 error = xfs_trans_read_buf(
699 mp, tp, mp->m_ddev_targp,
700 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
701 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
704 xfs_buf_set_ref(bp, XFS_AGFL_REF);
710 xfs_alloc_update_counters(
711 struct xfs_trans *tp,
712 struct xfs_perag *pag,
713 struct xfs_buf *agbp,
716 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
718 pag->pagf_freeblks += len;
719 be32_add_cpu(&agf->agf_freeblks, len);
721 xfs_trans_agblocks_delta(tp, len);
722 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
723 be32_to_cpu(agf->agf_length))) {
724 xfs_buf_corruption_error(agbp);
725 return -EFSCORRUPTED;
728 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
733 * Block allocation algorithm and data structures.
735 struct xfs_alloc_cur {
736 struct xfs_btree_cur *cnt; /* btree cursors */
737 struct xfs_btree_cur *bnolt;
738 struct xfs_btree_cur *bnogt;
739 xfs_extlen_t cur_len;/* current search length */
740 xfs_agblock_t rec_bno;/* extent startblock */
741 xfs_extlen_t rec_len;/* extent length */
742 xfs_agblock_t bno; /* alloc bno */
743 xfs_extlen_t len; /* alloc len */
744 xfs_extlen_t diff; /* diff from search bno */
745 unsigned int busy_gen;/* busy state */
750 * Set up cursors, etc. in the extent allocation cursor. This function can be
751 * called multiple times to reset an initialized structure without having to
752 * reallocate cursors.
756 struct xfs_alloc_arg *args,
757 struct xfs_alloc_cur *acur)
762 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
764 acur->cur_len = args->maxlen;
774 * Perform an initial cntbt lookup to check for availability of maxlen
775 * extents. If this fails, we'll return -ENOSPC to signal the caller to
776 * attempt a small allocation.
779 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
780 args->agbp, args->agno, XFS_BTNUM_CNT);
781 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
786 * Allocate the bnobt left and right search cursors.
789 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
790 args->agbp, args->agno, XFS_BTNUM_BNO);
792 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
793 args->agbp, args->agno, XFS_BTNUM_BNO);
794 return i == 1 ? 0 : -ENOSPC;
799 struct xfs_alloc_cur *acur,
802 int cur_error = XFS_BTREE_NOERROR;
805 cur_error = XFS_BTREE_ERROR;
808 xfs_btree_del_cursor(acur->cnt, cur_error);
810 xfs_btree_del_cursor(acur->bnolt, cur_error);
812 xfs_btree_del_cursor(acur->bnogt, cur_error);
813 acur->cnt = acur->bnolt = acur->bnogt = NULL;
817 * Check an extent for allocation and track the best available candidate in the
818 * allocation structure. The cursor is deactivated if it has entered an out of
819 * range state based on allocation arguments. Optionally return the extent
820 * geometry and allocation status if requested by the caller.
824 struct xfs_alloc_arg *args,
825 struct xfs_alloc_cur *acur,
826 struct xfs_btree_cur *cur,
830 xfs_agblock_t bno, bnoa, bnew;
831 xfs_extlen_t len, lena, diff = -1;
833 unsigned busy_gen = 0;
834 bool deactivate = false;
835 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
839 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
842 if (XFS_IS_CORRUPT(args->mp, i != 1))
843 return -EFSCORRUPTED;
846 * Check minlen and deactivate a cntbt cursor if out of acceptable size
847 * range (i.e., walking backwards looking for a minlen extent).
849 if (len < args->minlen) {
850 deactivate = !isbnobt;
854 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
858 acur->busy_gen = busy_gen;
859 /* deactivate a bnobt cursor outside of locality range */
860 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
861 deactivate = isbnobt;
864 if (lena < args->minlen)
867 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
868 xfs_alloc_fix_len(args);
869 ASSERT(args->len >= args->minlen);
870 if (args->len < acur->len)
874 * We have an aligned record that satisfies minlen and beats or matches
875 * the candidate extent size. Compare locality for near allocation mode.
877 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
878 diff = xfs_alloc_compute_diff(args->agbno, args->len,
879 args->alignment, args->datatype,
881 if (bnew == NULLAGBLOCK)
885 * Deactivate a bnobt cursor with worse locality than the current best.
887 if (diff > acur->diff) {
888 deactivate = isbnobt;
892 ASSERT(args->len > acur->len ||
893 (args->len == acur->len && diff <= acur->diff));
897 acur->len = args->len;
902 * We're done if we found a perfect allocation. This only deactivates
903 * the current cursor, but this is just an optimization to terminate a
904 * cntbt search that otherwise runs to the edge of the tree.
906 if (acur->diff == 0 && acur->len == args->maxlen)
910 cur->bc_private.a.priv.abt.active = false;
911 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
917 * Complete an allocation of a candidate extent. Remove the extent from both
918 * trees and update the args structure.
921 xfs_alloc_cur_finish(
922 struct xfs_alloc_arg *args,
923 struct xfs_alloc_cur *acur)
927 ASSERT(acur->cnt && acur->bnolt);
928 ASSERT(acur->bno >= acur->rec_bno);
929 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
930 ASSERT(acur->rec_bno + acur->rec_len <=
931 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
933 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
934 acur->rec_len, acur->bno, acur->len, 0);
938 args->agbno = acur->bno;
939 args->len = acur->len;
942 trace_xfs_alloc_cur(args);
947 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
948 * bno optimized lookup to search for extents with ideal size and locality.
951 xfs_alloc_cntbt_iter(
952 struct xfs_alloc_arg *args,
953 struct xfs_alloc_cur *acur)
955 struct xfs_btree_cur *cur = acur->cnt;
957 xfs_extlen_t len, cur_len;
961 if (!xfs_alloc_cur_active(cur))
964 /* locality optimized lookup */
965 cur_len = acur->cur_len;
966 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
971 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
975 /* check the current record and update search length from it */
976 error = xfs_alloc_cur_check(args, acur, cur, &i);
979 ASSERT(len >= acur->cur_len);
983 * We looked up the first record >= [agbno, len] above. The agbno is a
984 * secondary key and so the current record may lie just before or after
985 * agbno. If it is past agbno, check the previous record too so long as
986 * the length matches as it may be closer. Don't check a smaller record
987 * because that could deactivate our cursor.
989 if (bno > args->agbno) {
990 error = xfs_btree_decrement(cur, 0, &i);
992 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
993 if (!error && i && len == acur->cur_len)
994 error = xfs_alloc_cur_check(args, acur, cur,
1002 * Increment the search key until we find at least one allocation
1003 * candidate or if the extent we found was larger. Otherwise, double the
1004 * search key to optimize the search. Efficiency is more important here
1005 * than absolute best locality.
1008 if (!acur->len || acur->cur_len >= cur_len)
1011 acur->cur_len = cur_len;
1017 * Deal with the case where only small freespaces remain. Either return the
1018 * contents of the last freespace record, or allocate space from the freelist if
1019 * there is nothing in the tree.
1021 STATIC int /* error */
1022 xfs_alloc_ag_vextent_small(
1023 struct xfs_alloc_arg *args, /* allocation argument structure */
1024 struct xfs_btree_cur *ccur, /* optional by-size cursor */
1025 xfs_agblock_t *fbnop, /* result block number */
1026 xfs_extlen_t *flenp, /* result length */
1027 int *stat) /* status: 0-freelist, 1-normal/none */
1030 xfs_agblock_t fbno = NULLAGBLOCK;
1031 xfs_extlen_t flen = 0;
1035 * If a cntbt cursor is provided, try to allocate the largest record in
1036 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1037 * allocation. Make sure to respect minleft even when pulling from the
1041 error = xfs_btree_decrement(ccur, 0, &i);
1045 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1048 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1049 error = -EFSCORRUPTED;
1055 if (args->minlen != 1 || args->alignment != 1 ||
1056 args->resv == XFS_AG_RESV_AGFL ||
1057 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
1061 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1064 if (fbno == NULLAGBLOCK)
1067 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1068 (args->datatype & XFS_ALLOC_NOBUSY));
1070 if (args->datatype & XFS_ALLOC_USERDATA) {
1073 bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
1074 if (XFS_IS_CORRUPT(args->mp, !bp)) {
1075 error = -EFSCORRUPTED;
1078 xfs_trans_binval(args->tp, bp);
1080 *fbnop = args->agbno = fbno;
1081 *flenp = args->len = 1;
1082 if (XFS_IS_CORRUPT(args->mp,
1083 fbno >= be32_to_cpu(
1084 XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
1085 error = -EFSCORRUPTED;
1088 args->wasfromfl = 1;
1089 trace_xfs_alloc_small_freelist(args);
1092 * If we're feeding an AGFL block to something that doesn't live in the
1093 * free space, we need to clear out the OWN_AG rmap.
1095 error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
1096 &XFS_RMAP_OINFO_AG);
1105 * Can't do the allocation, give up.
1107 if (flen < args->minlen) {
1108 args->agbno = NULLAGBLOCK;
1109 trace_xfs_alloc_small_notenough(args);
1115 trace_xfs_alloc_small_done(args);
1119 trace_xfs_alloc_small_error(args);
1124 * Allocate a variable extent in the allocation group agno.
1125 * Type and bno are used to determine where in the allocation group the
1126 * extent will start.
1127 * Extent's length (returned in *len) will be between minlen and maxlen,
1128 * and of the form k * prod + mod unless there's nothing that large.
1129 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1131 STATIC int /* error */
1132 xfs_alloc_ag_vextent(
1133 xfs_alloc_arg_t *args) /* argument structure for allocation */
1137 ASSERT(args->minlen > 0);
1138 ASSERT(args->maxlen > 0);
1139 ASSERT(args->minlen <= args->maxlen);
1140 ASSERT(args->mod < args->prod);
1141 ASSERT(args->alignment > 0);
1144 * Branch to correct routine based on the type.
1146 args->wasfromfl = 0;
1147 switch (args->type) {
1148 case XFS_ALLOCTYPE_THIS_AG:
1149 error = xfs_alloc_ag_vextent_size(args);
1151 case XFS_ALLOCTYPE_NEAR_BNO:
1152 error = xfs_alloc_ag_vextent_near(args);
1154 case XFS_ALLOCTYPE_THIS_BNO:
1155 error = xfs_alloc_ag_vextent_exact(args);
1162 if (error || args->agbno == NULLAGBLOCK)
1165 ASSERT(args->len >= args->minlen);
1166 ASSERT(args->len <= args->maxlen);
1167 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1168 ASSERT(args->agbno % args->alignment == 0);
1170 /* if not file data, insert new block into the reverse map btree */
1171 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1172 error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
1173 args->agbno, args->len, &args->oinfo);
1178 if (!args->wasfromfl) {
1179 error = xfs_alloc_update_counters(args->tp, args->pag,
1181 -((long)(args->len)));
1185 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
1186 args->agbno, args->len));
1189 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1191 XFS_STATS_INC(args->mp, xs_allocx);
1192 XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1197 * Allocate a variable extent at exactly agno/bno.
1198 * Extent's length (returned in *len) will be between minlen and maxlen,
1199 * and of the form k * prod + mod unless there's nothing that large.
1200 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1202 STATIC int /* error */
1203 xfs_alloc_ag_vextent_exact(
1204 xfs_alloc_arg_t *args) /* allocation argument structure */
1206 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
1207 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
1209 xfs_agblock_t fbno; /* start block of found extent */
1210 xfs_extlen_t flen; /* length of found extent */
1211 xfs_agblock_t tbno; /* start block of busy extent */
1212 xfs_extlen_t tlen; /* length of busy extent */
1213 xfs_agblock_t tend; /* end block of busy extent */
1214 int i; /* success/failure of operation */
1217 ASSERT(args->alignment == 1);
1220 * Allocate/initialize a cursor for the by-number freespace btree.
1222 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1223 args->agno, XFS_BTNUM_BNO);
1226 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1227 * Look for the closest free block <= bno, it must contain bno
1228 * if any free block does.
1230 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1237 * Grab the freespace record.
1239 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1242 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1243 error = -EFSCORRUPTED;
1246 ASSERT(fbno <= args->agbno);
1249 * Check for overlapping busy extents.
1253 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1256 * Give up if the start of the extent is busy, or the freespace isn't
1257 * long enough for the minimum request.
1259 if (tbno > args->agbno)
1261 if (tlen < args->minlen)
1264 if (tend < args->agbno + args->minlen)
1268 * End of extent will be the smaller of the freespace end and the
1269 * maximal requested end.
1271 * Fix the length according to mod and prod if given.
1273 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1275 xfs_alloc_fix_len(args);
1276 ASSERT(args->agbno + args->len <= tend);
1279 * We are allocating agbno for args->len
1280 * Allocate/initialize a cursor for the by-size btree.
1282 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1283 args->agno, XFS_BTNUM_CNT);
1284 ASSERT(args->agbno + args->len <=
1285 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1286 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1287 args->len, XFSA_FIXUP_BNO_OK);
1289 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1293 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1294 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1296 args->wasfromfl = 0;
1297 trace_xfs_alloc_exact_done(args);
1301 /* Didn't find it, return null. */
1302 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1303 args->agbno = NULLAGBLOCK;
1304 trace_xfs_alloc_exact_notfound(args);
1308 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1309 trace_xfs_alloc_exact_error(args);
1314 * Search a given number of btree records in a given direction. Check each
1315 * record against the good extent we've already found.
1318 xfs_alloc_walk_iter(
1319 struct xfs_alloc_arg *args,
1320 struct xfs_alloc_cur *acur,
1321 struct xfs_btree_cur *cur,
1323 bool find_one, /* quit on first candidate */
1324 int count, /* rec count (-1 for infinite) */
1333 * Search so long as the cursor is active or we find a better extent.
1334 * The cursor is deactivated if it extends beyond the range of the
1335 * current allocation candidate.
1337 while (xfs_alloc_cur_active(cur) && count) {
1338 error = xfs_alloc_cur_check(args, acur, cur, &i);
1346 if (!xfs_alloc_cur_active(cur))
1350 error = xfs_btree_increment(cur, 0, &i);
1352 error = xfs_btree_decrement(cur, 0, &i);
1356 cur->bc_private.a.priv.abt.active = false;
1366 * Search the by-bno and by-size btrees in parallel in search of an extent with
1367 * ideal locality based on the NEAR mode ->agbno locality hint.
1370 xfs_alloc_ag_vextent_locality(
1371 struct xfs_alloc_arg *args,
1372 struct xfs_alloc_cur *acur,
1375 struct xfs_btree_cur *fbcur = NULL;
1380 ASSERT(acur->len == 0);
1381 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1385 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1388 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1391 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1396 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1397 * right and lookup the closest extent to the locality hint for each
1398 * extent size key in the cntbt. The entire search terminates
1399 * immediately on a bnobt hit because that means we've found best case
1400 * locality. Otherwise the search continues until the cntbt cursor runs
1401 * off the end of the tree. If no allocation candidate is found at this
1402 * point, give up on locality, walk backwards from the end of the cntbt
1403 * and take the first available extent.
1405 * The parallel tree searches balance each other out to provide fairly
1406 * consistent performance for various situations. The bnobt search can
1407 * have pathological behavior in the worst case scenario of larger
1408 * allocation requests and fragmented free space. On the other hand, the
1409 * bnobt is able to satisfy most smaller allocation requests much more
1410 * quickly than the cntbt. The cntbt search can sift through fragmented
1411 * free space and sets of free extents for larger allocation requests
1412 * more quickly than the bnobt. Since the locality hint is just a hint
1413 * and we don't want to scan the entire bnobt for perfect locality, the
1414 * cntbt search essentially bounds the bnobt search such that we can
1415 * find good enough locality at reasonable performance in most cases.
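/*
 * A rough sketch of one pass of the loop below: walk a bounded number of
 * records left from the bnolt cursor and right from the bnogt cursor,
 * checking each against the best candidate so far, then do one locality
 * optimized cntbt lookup at the current extent size key.  A bnobt hit ends
 * the search early; the cntbt cursor running off the end of the tree ends
 * it with whatever candidate has been tracked.
 */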
1417 while (xfs_alloc_cur_active(acur->bnolt) ||
1418 xfs_alloc_cur_active(acur->bnogt) ||
1419 xfs_alloc_cur_active(acur->cnt)) {
1421 trace_xfs_alloc_cur_lookup(args);
1424 * Search the bnobt left and right. In the case of a hit, finish
1425 * the search in the opposite direction and we're done.
1427 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1432 trace_xfs_alloc_cur_left(args);
1433 fbcur = acur->bnogt;
1437 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1442 trace_xfs_alloc_cur_right(args);
1443 fbcur = acur->bnolt;
1449 * Check the extent with best locality based on the current
1450 * extent size search key and keep track of the best candidate.
1452 error = xfs_alloc_cntbt_iter(args, acur);
1455 if (!xfs_alloc_cur_active(acur->cnt)) {
1456 trace_xfs_alloc_cur_lookup_done(args);
1462 * If we failed to find anything due to busy extents, return empty
1463 * handed so the caller can flush and retry. If no busy extents were
1464 * found, walk backwards from the end of the cntbt as a last resort.
1466 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1467 error = xfs_btree_decrement(acur->cnt, 0, &i);
1471 acur->cnt->bc_private.a.priv.abt.active = true;
1478 * Search in the opposite direction for a better entry in the case of
1479 * a bnobt hit or walk backwards from the end of the cntbt.
1482 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1494 /* Check the last block of the cnt btree for allocations. */
1496 xfs_alloc_ag_vextent_lastblock(
1497 struct xfs_alloc_arg *args,
1498 struct xfs_alloc_cur *acur,
1507 /* Randomly don't execute the first algorithm. */
1508 if (prandom_u32() & 1)
1513 * Start from the entry that lookup found, sequence through all larger
1514 * free blocks. If we're actually pointing at a record smaller than
1515 * maxlen, go to the start of this block, and skip all those smaller
1518 if (len || args->alignment > 1) {
1519 acur->cnt->bc_ptrs[0] = 1;
1521 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1524 if (XFS_IS_CORRUPT(args->mp, i != 1))
1525 return -EFSCORRUPTED;
1526 if (*len >= args->minlen)
1528 error = xfs_btree_increment(acur->cnt, 0, &i);
1532 ASSERT(*len >= args->minlen);
1537 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1542 * It didn't work. We COULD be in a case where there's a good record
1543 * somewhere, so try again.
1548 trace_xfs_alloc_near_first(args);
1554 * Allocate a variable extent near bno in the allocation group agno.
1555 * Extent's length (returned in len) will be between minlen and maxlen,
1556 * and of the form k * prod + mod unless there's nothing that large.
1557 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1560 xfs_alloc_ag_vextent_near(
1561 struct xfs_alloc_arg *args)
1563 struct xfs_alloc_cur acur = {};
1564 int error; /* error code */
1565 int i; /* result code, temporary */
1569 /* handle uninitialized agbno range so caller doesn't have to */
1570 if (!args->min_agbno && !args->max_agbno)
1571 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1572 ASSERT(args->min_agbno <= args->max_agbno);
1574 /* clamp agbno to the range if it's outside */
1575 if (args->agbno < args->min_agbno)
1576 args->agbno = args->min_agbno;
1577 if (args->agbno > args->max_agbno)
1578 args->agbno = args->max_agbno;
1584 * Set up cursors and see if there are any free extents as big as
1585 * maxlen. If not, pick the last entry in the tree unless the tree is
1588 error = xfs_alloc_cur_setup(args, &acur);
1589 if (error == -ENOSPC) {
1590 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1594 if (i == 0 || len == 0) {
1595 trace_xfs_alloc_near_noentry(args);
1605 * If the requested extent is large wrt the freespaces available
1606 * in this a.g., then the cursor will be pointing to a btree entry
1607 * near the right edge of the tree. If it's in the last btree leaf
1608 * block, then we just examine all the entries in that block
1609 * that are big enough, and pick the best one.
1611 if (xfs_btree_islastblock(acur.cnt, 0)) {
1612 bool allocated = false;
1614 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1623 * Second algorithm. Combined cntbt and bnobt search to find ideal
1626 error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1631 * If we couldn't get anything, give up.
1635 trace_xfs_alloc_near_busy(args);
1636 xfs_extent_busy_flush(args->mp, args->pag,
1640 trace_xfs_alloc_size_neither(args);
1641 args->agbno = NULLAGBLOCK;
1646 /* fix up btrees on a successful allocation */
1647 error = xfs_alloc_cur_finish(args, &acur);
1650 xfs_alloc_cur_close(&acur, error);
1655 * Allocate a variable extent anywhere in the allocation group agno.
1656 * Extent's length (returned in len) will be between minlen and maxlen,
1657 * and of the form k * prod + mod unless there's nothing that large.
1658 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1660 STATIC int /* error */
1661 xfs_alloc_ag_vextent_size(
1662 xfs_alloc_arg_t *args) /* allocation argument structure */
1664 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1665 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1666 int error; /* error result */
1667 xfs_agblock_t fbno; /* start of found freespace */
1668 xfs_extlen_t flen; /* length of found freespace */
1669 int i; /* temp status variable */
1670 xfs_agblock_t rbno; /* returned block number */
1671 xfs_extlen_t rlen; /* length of returned extent */
1677 * Allocate and initialize a cursor for the by-size btree.
1679 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1680 args->agno, XFS_BTNUM_CNT);
1685 * Look for an entry >= maxlen+alignment-1 blocks.
1687 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1688 args->maxlen + args->alignment - 1, &i)))
1692 * If none then we have to settle for a smaller extent. In the case that
1693 * there are no large extents, this will return the last entry in the
1694 * tree unless the tree is empty. In the case that there are only busy
1695 * large extents, this will return the largest small extent unless there
1696 * are no smaller extents available.
1699 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1703 if (i == 0 || flen == 0) {
1704 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1705 trace_xfs_alloc_size_noentry(args);
1709 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1713 * Search for a non-busy extent that is large enough.
1716 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1719 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1720 error = -EFSCORRUPTED;
1724 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1725 &rbno, &rlen, &busy_gen);
1727 if (rlen >= args->maxlen)
1730 error = xfs_btree_increment(cnt_cur, 0, &i);
1735 * Our only valid extents must have been busy.
1736 * Make it unbusy by forcing the log out and
1739 xfs_btree_del_cursor(cnt_cur,
1741 trace_xfs_alloc_size_busy(args);
1742 xfs_extent_busy_flush(args->mp,
1743 args->pag, busy_gen);
1750 * In the first case above, we got the last entry in the
1751 * by-size btree. Now we check to see if the space hits maxlen
1752 * once aligned; if not, we search left for something better.
1753 * This can't happen in the second case above.
1755 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1756 if (XFS_IS_CORRUPT(args->mp,
1759 rbno + rlen > fbno + flen))) {
1760 error = -EFSCORRUPTED;
1763 if (rlen < args->maxlen) {
1764 xfs_agblock_t bestfbno;
1765 xfs_extlen_t bestflen;
1766 xfs_agblock_t bestrbno;
1767 xfs_extlen_t bestrlen;
1774 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1778 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1781 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1782 error = -EFSCORRUPTED;
1785 if (flen < bestrlen)
1787 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1788 &rbno, &rlen, &busy_gen);
1789 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1790 if (XFS_IS_CORRUPT(args->mp,
1793 rbno + rlen > fbno + flen))) {
1794 error = -EFSCORRUPTED;
1797 if (rlen > bestrlen) {
1802 if (rlen == args->maxlen)
1806 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1809 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1810 error = -EFSCORRUPTED;
1818 args->wasfromfl = 0;
1820 * Fix up the length.
1823 if (rlen < args->minlen) {
1825 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1826 trace_xfs_alloc_size_busy(args);
1827 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1832 xfs_alloc_fix_len(args);
1835 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1836 error = -EFSCORRUPTED;
1840 * Allocate and initialize a cursor for the by-block tree.
1842 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1843 args->agno, XFS_BTNUM_BNO);
1844 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1845 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1847 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1848 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1849 cnt_cur = bno_cur = NULL;
1852 if (XFS_IS_CORRUPT(args->mp,
1853 args->agbno + args->len >
1855 XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
1856 error = -EFSCORRUPTED;
1859 trace_xfs_alloc_size_done(args);
1863 trace_xfs_alloc_size_error(args);
1865 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1867 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1871 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1872 trace_xfs_alloc_size_nominleft(args);
1873 args->agbno = NULLAGBLOCK;
1878 * Free the extent starting at agno/bno for length.
1882 struct xfs_trans *tp,
1883 struct xfs_buf *agbp,
1884 xfs_agnumber_t agno,
1887 const struct xfs_owner_info *oinfo,
1888 enum xfs_ag_resv_type type)
1890 struct xfs_mount *mp;
1891 struct xfs_perag *pag;
1892 struct xfs_btree_cur *bno_cur;
1893 struct xfs_btree_cur *cnt_cur;
1894 xfs_agblock_t gtbno; /* start of right neighbor */
1895 xfs_extlen_t gtlen; /* length of right neighbor */
1896 xfs_agblock_t ltbno; /* start of left neighbor */
1897 xfs_extlen_t ltlen; /* length of left neighbor */
1898 xfs_agblock_t nbno; /* new starting block of freesp */
1899 xfs_extlen_t nlen; /* new length of freespace */
1900 int haveleft; /* have a left neighbor */
1901 int haveright; /* have a right neighbor */
1905 bno_cur = cnt_cur = NULL;
1908 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1909 error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
1915 * Allocate and initialize a cursor for the by-block btree.
1917 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1919 * Look for a neighboring block on the left (lower block numbers)
1920 * that is contiguous with this space.
1922 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1926 * There is a block to our left.
1928 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i)))
1930 if (XFS_IS_CORRUPT(mp, i != 1)) {
1931 error = -EFSCORRUPTED;
1935 * It's not contiguous, though.
1937 if (ltbno + ltlen < bno)
1941 * If this failure happens the request to free this
1942 * space was invalid, it's (partly) already free.
1945 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1946 error = -EFSCORRUPTED;
1952 * Look for a neighboring block on the right (higher block numbers)
1953 * that is contiguous with this space.
1955 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1959 * There is a block to our right.
1961 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i)))
1963 if (XFS_IS_CORRUPT(mp, i != 1)) {
1964 error = -EFSCORRUPTED;
1968 * It's not contiguous, though.
1970 if (bno + len < gtbno)
1974 * If this failure happens the request to free this
1975 * space was invalid, it's (partly) already free.
1978 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1979 error = -EFSCORRUPTED;
1985 * Now allocate and initialize a cursor for the by-size tree.
1987 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1989 * Have both left and right contiguous neighbors.
1990 * Merge all three into a single free block.
1992 if (haveleft && haveright) {
1994 * Delete the old by-size entry on the left.
1996 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1998 if (XFS_IS_CORRUPT(mp, i != 1)) {
1999 error = -EFSCORRUPTED;
2002 if ((error = xfs_btree_delete(cnt_cur, &i)))
2004 if (XFS_IS_CORRUPT(mp, i != 1)) {
2005 error = -EFSCORRUPTED;
2009 * Delete the old by-size entry on the right.
2011 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2013 if (XFS_IS_CORRUPT(mp, i != 1)) {
2014 error = -EFSCORRUPTED;
2017 if ((error = xfs_btree_delete(cnt_cur, &i)))
2019 if (XFS_IS_CORRUPT(mp, i != 1)) {
2020 error = -EFSCORRUPTED;
2024 * Delete the old by-block entry for the right block.
2026 if ((error = xfs_btree_delete(bno_cur, &i)))
2028 if (XFS_IS_CORRUPT(mp, i != 1)) {
2029 error = -EFSCORRUPTED;
2033 * Move the by-block cursor back to the left neighbor.
2035 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2037 if (XFS_IS_CORRUPT(mp, i != 1)) {
2038 error = -EFSCORRUPTED;
2043 * Check that this is the right record: delete didn't
2044 * mangle the cursor.
2047 xfs_agblock_t xxbno;
2050 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2053 if (XFS_IS_CORRUPT(mp,
2057 error = -EFSCORRUPTED;
2063 * Update remaining by-block entry to the new, joined block.
2066 nlen = len + ltlen + gtlen;
2067 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2071 * Have only a left contiguous neighbor.
2072 * Merge it together with the new freespace.
2074 else if (haveleft) {
2076 * Delete the old by-size entry on the left.
2078 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2080 if (XFS_IS_CORRUPT(mp, i != 1)) {
2081 error = -EFSCORRUPTED;
2084 if ((error = xfs_btree_delete(cnt_cur, &i)))
2086 if (XFS_IS_CORRUPT(mp, i != 1)) {
2087 error = -EFSCORRUPTED;
2091 * Back up the by-block cursor to the left neighbor, and
2092 * update its length.
2094 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2096 if (XFS_IS_CORRUPT(mp, i != 1)) {
2097 error = -EFSCORRUPTED;
2102 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2106 * Have only a right contiguous neighbor.
2107 * Merge it together with the new freespace.
2109 else if (haveright) {
2111 * Delete the old by-size entry on the right.
2113 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2115 if (XFS_IS_CORRUPT(mp, i != 1)) {
2116 error = -EFSCORRUPTED;
2119 if ((error = xfs_btree_delete(cnt_cur, &i)))
2121 if (XFS_IS_CORRUPT(mp, i != 1)) {
2122 error = -EFSCORRUPTED;
2126 * Update the starting block and length of the right
2127 * neighbor in the by-block tree.
2131 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2135 * No contiguous neighbors.
2136 * Insert the new freespace into the by-block tree.
2141 if ((error = xfs_btree_insert(bno_cur, &i)))
2143 if (XFS_IS_CORRUPT(mp, i != 1)) {
2144 error = -EFSCORRUPTED;
2148 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2151 * In all cases we need to insert the new freespace in the by-size tree.
2153 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2155 if (XFS_IS_CORRUPT(mp, i != 0)) {
2156 error = -EFSCORRUPTED;
2159 if ((error = xfs_btree_insert(cnt_cur, &i)))
2161 if (XFS_IS_CORRUPT(mp, i != 1)) {
2162 error = -EFSCORRUPTED;
2165 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2169 * Update the freespace totals in the ag and superblock.
2171 pag = xfs_perag_get(mp, agno);
2172 error = xfs_alloc_update_counters(tp, pag, agbp, len);
2173 xfs_ag_resv_free_extent(pag, type, tp, len);
2178 XFS_STATS_INC(mp, xs_freex);
2179 XFS_STATS_ADD(mp, xs_freeb, len);
2181 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2186 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2188 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2190 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2195 * Visible (exported) allocation/free functions.
2196 * Some of these are used just by xfs_alloc_btree.c and this file.
2200 * Compute and fill in value of m_ag_maxlevels.
2203 xfs_alloc_compute_maxlevels(
2204 xfs_mount_t *mp) /* file system mount structure */
2206 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2207 (mp->m_sb.sb_agblocks + 1) / 2);
2211 * Find the length of the longest extent in an AG. The 'need' parameter
2212 * specifies how much space we're going to need for the AGFL and the
2213 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2217 xfs_alloc_longest_free_extent(
2218 struct xfs_perag *pag,
2220 xfs_extlen_t reserved)
2222 xfs_extlen_t delta = 0;
2225 * If the AGFL needs a recharge, we'll have to subtract that from the
2228 if (need > pag->pagf_flcount)
2229 delta = need - pag->pagf_flcount;
2232 * If we cannot maintain others' reservations with space from the
2233 * not-longest freesp extents, we'll have to subtract /that/ from
2234 * the longest extent too.
2236 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2237 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2240 * If the longest extent is long enough to satisfy all the
2241 * reservations and AGFL rules in place, we can return this extent.
2243 if (pag->pagf_longest > delta)
2244 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2245 pag->pagf_longest - delta);
2247 /* Otherwise, let the caller try for 1 block if there's space. */
2248 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
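/*
 * Worked example with made-up counters: pagf_longest 100, pagf_freeblks 150,
 * pagf_flcount 2, need 6, reserved 40.  The AGFL recharge contributes
 * delta = 6 - 2 = 4, the reservation adds nothing (150 - 100 = 50 >= 40),
 * so the usable longest extent is 100 - 4 = 96, capped at m_ag_max_usable.
 */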
2252 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2253 * return the largest possible minimum length.
2256 xfs_alloc_min_freelist(
2257 struct xfs_mount *mp,
2258 struct xfs_perag *pag)
2260 /* AG btrees have at least 1 level. */
2261 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2262 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
2263 unsigned int min_free;
2265 ASSERT(mp->m_ag_maxlevels > 0);
2267 /* space needed by-bno freespace btree */
2268 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2269 mp->m_ag_maxlevels);
2270 /* space needed by-size freespace btree */
2271 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2272 mp->m_ag_maxlevels);
2273 /* space needed reverse mapping used space btree */
2274 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2275 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2276 mp->m_rmap_maxlevels);
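/*
 * For example, a hypothetical AG with 2-level bno and cnt btrees and a
 * 3-level rmapbt needs at least (2 + 1) + (2 + 1) + (3 + 1) = 10 free list
 * blocks, i.e. enough for each tree to grow by one level during a single
 * fixup.
 */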
2282 * Check if the operation we are fixing up the freelist for should go ahead or
2283 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2284 * is dependent on whether the size and shape of free space available will
2285 * permit the requested allocation to take place.
2288 xfs_alloc_space_available(
2289 struct xfs_alloc_arg *args,
2290 xfs_extlen_t min_free,
2293 struct xfs_perag *pag = args->pag;
2294 xfs_extlen_t alloc_len, longest;
2295 xfs_extlen_t reservation; /* blocks that are still reserved */
2297 xfs_extlen_t agflcount;
2299 if (flags & XFS_ALLOC_FLAG_FREEING)
2302 reservation = xfs_ag_resv_needed(pag, args->resv);
2304 /* do we have enough contiguous free space for the allocation? */
2305 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2306 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2307 if (longest < alloc_len)
2311 * Do we have enough free space remaining for the allocation? Don't
2312 * account extra agfl blocks because we are about to defer freeing them,
2313 * making them unavailable until the current transaction commits.
2315 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2316 available = (int)(pag->pagf_freeblks + agflcount -
2317 reservation - min_free - args->minleft);
2318 if (available < (int)max(args->total, alloc_len))
2322 * Clamp maxlen to the amount of free space available for the actual
2323 * extent allocation.
2325 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2326 args->maxlen = available;
2327 ASSERT(args->maxlen > 0);
2328 ASSERT(args->maxlen >= args->minlen);
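/*
 * Example with made-up numbers: pagf_freeblks 50, pagf_flcount 4,
 * min_free 6, reservation 10, minleft 0, total 20, alloc_len 16.
 * agflcount = min(4, 6) = 4, so available = 50 + 4 - 10 - 6 - 0 = 38,
 * which covers max(total, alloc_len) = 20; maxlen would then be clamped
 * to 38 if it were larger (unless XFS_ALLOC_FLAG_CHECK is set).
 */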
2335 xfs_free_agfl_block(
2336 struct xfs_trans *tp,
2337 xfs_agnumber_t agno,
2338 xfs_agblock_t agbno,
2339 struct xfs_buf *agbp,
2340 struct xfs_owner_info *oinfo)
2345 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2350 bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
2351 if (XFS_IS_CORRUPT(tp->t_mountp, !bp))
2352 return -EFSCORRUPTED;
2353 xfs_trans_binval(tp, bp);
2359 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2360 * is to detect an agfl header padding mismatch between current and early v5
2361 * kernels. This problem manifests as a 1-slot size difference between the
2362 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2363 * may also catch variants of agfl count corruption unrelated to padding. Either
2364 * way, we'll reset the agfl and warn the user.
2366 * Return true if a reset is required before the agfl can be used, false
2370 xfs_agfl_needs_reset(
2371 struct xfs_mount *mp,
2372 struct xfs_agf *agf)
2374 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2375 uint32_t l = be32_to_cpu(agf->agf_fllast);
2376 uint32_t c = be32_to_cpu(agf->agf_flcount);
2377 int agfl_size = xfs_agfl_size(mp);
2380 /* no agfl header on v4 supers */
2381 if (!xfs_sb_version_hascrc(&mp->m_sb))
2385 * The agf read verifier catches severe corruption of these fields.
2386 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2387 * the verifier allows it.
2389 if (f >= agfl_size || l >= agfl_size)
2395 * Check consistency between the on-disk count and the active range. An
2396 * agfl padding mismatch manifests as an inconsistent flcount.
2401 active = agfl_size - f + l + 1;
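/*
 * Example of the wrapped case with made-up indices: agfl_size 119,
 * flfirst 117, fllast 1 gives an active range of 119 - 117 + 1 + 1 = 4
 * slots (117, 118, 0, 1); a padding-induced off-by-one shows up as an
 * on-disk flcount of 3 or 5 and forces a reset.
 */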
2409 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2410 * agfl content cannot be trusted. Warn the user that a repair is required to
2411 * recover leaked blocks.
2413 * The purpose of this mechanism is to handle filesystems affected by the agfl
2414 * header padding mismatch problem. A reset keeps the filesystem online with a
2415 * relatively minor free space accounting inconsistency rather than suffer the
2416 * inevitable crash from use of an invalid agfl block.
2420 struct xfs_trans *tp,
2421 struct xfs_buf *agbp,
2422 struct xfs_perag *pag)
2424 struct xfs_mount *mp = tp->t_mountp;
2425 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
2427 ASSERT(pag->pagf_agflreset);
2428 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2431 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2432 "Please unmount and run xfs_repair.",
2433 pag->pag_agno, pag->pagf_flcount);
2435 agf->agf_flfirst = 0;
2436 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2437 agf->agf_flcount = 0;
2438 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2441 pag->pagf_flcount = 0;
2442 pag->pagf_agflreset = false;
2446 * Defer an AGFL block free. This is effectively equivalent to
2447 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2449 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2450 * allocation operations in a transaction. AGFL frees are prone to this problem
2451 * because, for one, they are always freed one at a time. Further, an immediate
2452 * AGFL block free can cause a btree join and require another block free before
2453 * the real allocation can proceed. Deferring the free disconnects freeing up
2454 * the AGFL slot from freeing the block.
2457 xfs_defer_agfl_block(
2458 struct xfs_trans *tp,
2459 xfs_agnumber_t agno,
2460 xfs_fsblock_t agbno,
2461 struct xfs_owner_info *oinfo)
2463 struct xfs_mount *mp = tp->t_mountp;
2464 struct xfs_extent_free_item *new; /* new element */
2466 ASSERT(xfs_bmap_free_item_zone != NULL);
2467 ASSERT(oinfo != NULL);
2469 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
2470 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2471 new->xefi_blockcount = 1;
2472 new->xefi_oinfo = *oinfo;
2474 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2476 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
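/*
 * The deferred free is processed later via the XFS_DEFER_OPS_TYPE_AGFL_FREE
 * deferred op, which eventually calls xfs_free_agfl_block() above to free
 * the block and invalidate its buffer.
 */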
2480 * Decide whether to use this allocation group for this allocation.
2481 * If so, fix up the btree freelist's size.
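 *
 * In outline: check that the AG can satisfy the request, read and lock the
 * AGF, reset a mismatched AGFL if needed, then shrink the freelist by
 * deferring block frees or grow it by allocating new blocks until it holds
 * xfs_alloc_min_freelist() entries.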
2484 xfs_alloc_fix_freelist(
2485 struct xfs_alloc_arg *args, /* allocation argument structure */
2486 int flags) /* XFS_ALLOC_FLAG_... */
2488 struct xfs_mount *mp = args->mp;
2489 struct xfs_perag *pag = args->pag;
2490 struct xfs_trans *tp = args->tp;
2491 struct xfs_buf *agbp = NULL;
2492 struct xfs_buf *agflbp = NULL;
2493 struct xfs_alloc_arg targs; /* local allocation arguments */
2494 xfs_agblock_t bno; /* freelist block */
2495 xfs_extlen_t need; /* total blocks needed in freelist */
2498 /* deferred ops (AGFL block frees) require permanent transactions */
2499 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2501 if (!pag->pagf_init) {
2502 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2505 if (!pag->pagf_init) {
2506 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
2507 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2508 goto out_agbp_relse;
2513 * If this is a metadata preferred pag and we are allocating user data, then
2514 * try somewhere else if we are not being asked to try harder at this point.
2517 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2518 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2519 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2520 goto out_agbp_relse;
2523 need = xfs_alloc_min_freelist(mp, pag);
2524 if (!xfs_alloc_space_available(args, need, flags |
2525 XFS_ALLOC_FLAG_CHECK))
2526 goto out_agbp_relse;
2529 * Get the a.g. freespace buffer.
2530 * Can fail if we're not blocking on locks, and it's held.
2533 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2537 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
2538 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2543 /* reset a padding mismatched agfl before final free space check */
2544 if (pag->pagf_agflreset)
2545 xfs_agfl_reset(tp, agbp, pag);
2547 /* If there isn't enough total space or a large enough single free extent, reject it. */
2548 need = xfs_alloc_min_freelist(mp, pag);
2549 if (!xfs_alloc_space_available(args, need, flags))
2550 goto out_agbp_relse;
2553 * Make the freelist shorter if it's too long.
2555 * Note that from this point onwards, we will always release the agf and
2556 * agfl buffers on error. This handles the case where we error out and
2557 * the buffers are clean or may not have been joined to the transaction
2558 * and hence need to be released manually. If they have been joined to
2559 * the transaction, then xfs_trans_brelse() will handle them
2560 * appropriately based on the recursion count and dirty state of the buffer.
2563 * XXX (dgc): When we have lots of free space, does this buy us
2564 * anything other than extra overhead when we need to put more blocks
2565 * back on the free list? Maybe we should only do this when space is
2566 * getting low or the AGFL is more than half full?
2568 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2569 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2570 * updating the rmapbt. Both flags are used in xfs_repair while we're
2571 * rebuilding the rmapbt, and neither are used by the kernel. They're
2572 * both required to ensure that rmaps are correctly recorded for the
2573 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2574 * repair/rmap.c in xfsprogs for details.
2576 memset(&targs, 0, sizeof(targs));
2577 /* struct copy below */
2578 if (flags & XFS_ALLOC_FLAG_NORMAP)
2579 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2581 targs.oinfo = XFS_RMAP_OINFO_AG;
2582 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2583 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2585 goto out_agbp_relse;
2587 /* defer agfl frees */
2588 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2594 targs.agno = args->agno;
2595 targs.alignment = targs.minlen = targs.prod = 1;
2596 targs.type = XFS_ALLOCTYPE_THIS_AG;
2598 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2600 goto out_agbp_relse;
2602 /* Make the freelist longer if it's too short. */
2603 while (pag->pagf_flcount < need) {
2605 targs.maxlen = need - pag->pagf_flcount;
2606 targs.resv = XFS_AG_RESV_AGFL;
2608 /* Allocate as many blocks as possible at once. */
2609 error = xfs_alloc_ag_vextent(&targs);
2611 goto out_agflbp_relse;
2614 * Stop if we run out. Won't happen if callers are obeying
2615 * the restrictions correctly. Can happen for free calls
2616 * on a completely full ag.
2618 if (targs.agbno == NULLAGBLOCK) {
2619 if (flags & XFS_ALLOC_FLAG_FREEING)
2621 goto out_agflbp_relse;
2624 * Put each allocated block on the list.
2626 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2627 error = xfs_alloc_put_freelist(tp, agbp,
2630 goto out_agflbp_relse;
2633 xfs_trans_brelse(tp, agflbp);
2638 xfs_trans_brelse(tp, agflbp);
2641 xfs_trans_brelse(tp, agbp);
2648 * Get a block from the freelist.
2649 * Returns the block number in *bnop, or NULLAGBLOCK if the freelist is empty.
2652 xfs_alloc_get_freelist(
2653 xfs_trans_t *tp, /* transaction pointer */
2654 xfs_buf_t *agbp, /* buffer containing the agf structure */
2655 xfs_agblock_t *bnop, /* block address retrieved from freelist */
2656 int btreeblk) /* destination is an AGF btree */
2658 xfs_agf_t *agf; /* a.g. freespace structure */
2659 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
2660 xfs_agblock_t bno; /* block number returned */
2664 xfs_mount_t *mp = tp->t_mountp;
2665 xfs_perag_t *pag; /* per allocation group data */
2668 * Freelist is empty, give up.
2670 agf = XFS_BUF_TO_AGF(agbp);
2671 if (!agf->agf_flcount) {
2672 *bnop = NULLAGBLOCK;
2676 * Read the array of free blocks.
2678 error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2685 * Get the block number and update the data structures.
2687 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
2688 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2689 be32_add_cpu(&agf->agf_flfirst, 1);
2690 xfs_trans_brelse(tp, agflbp);
2691 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2692 agf->agf_flfirst = 0;
2694 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2695 ASSERT(!pag->pagf_agflreset);
2696 be32_add_cpu(&agf->agf_flcount, -1);
2697 xfs_trans_agflist_delta(tp, -1);
2698 pag->pagf_flcount--;
2700 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2702 be32_add_cpu(&agf->agf_btreeblks, 1);
2703 pag->pagf_btreeblks++;
2704 logflags |= XFS_AGF_BTREEBLKS;
2708 xfs_alloc_log_agf(tp, agbp, logflags);
2715 * Log the given fields from the agf structure.
2719 xfs_trans_t *tp, /* transaction pointer */
2720 xfs_buf_t *bp, /* buffer for a.g. freelist header */
2721 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2723 int first; /* first byte offset */
2724 int last; /* last byte offset */
2725 static const short offsets[] = {
2726 offsetof(xfs_agf_t, agf_magicnum),
2727 offsetof(xfs_agf_t, agf_versionnum),
2728 offsetof(xfs_agf_t, agf_seqno),
2729 offsetof(xfs_agf_t, agf_length),
2730 offsetof(xfs_agf_t, agf_roots[0]),
2731 offsetof(xfs_agf_t, agf_levels[0]),
2732 offsetof(xfs_agf_t, agf_flfirst),
2733 offsetof(xfs_agf_t, agf_fllast),
2734 offsetof(xfs_agf_t, agf_flcount),
2735 offsetof(xfs_agf_t, agf_freeblks),
2736 offsetof(xfs_agf_t, agf_longest),
2737 offsetof(xfs_agf_t, agf_btreeblks),
2738 offsetof(xfs_agf_t, agf_uuid),
2739 offsetof(xfs_agf_t, agf_rmap_blocks),
2740 offsetof(xfs_agf_t, agf_refcount_blocks),
2741 offsetof(xfs_agf_t, agf_refcount_root),
2742 offsetof(xfs_agf_t, agf_refcount_level),
2743 /* needed so that we don't log the whole rest of the structure: */
2744 offsetof(xfs_agf_t, agf_spare64),
2748 trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
2750 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
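/*
 * Convert the field bitmask into first/last byte offsets via the table
 * above so that only the modified region of the AGF is logged.
 */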
2752 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2753 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2757 * Interface for inode allocation to force the pag data to be initialized.
2760 xfs_alloc_pagf_init(
2761 xfs_mount_t *mp, /* file system mount structure */
2762 xfs_trans_t *tp, /* transaction pointer */
2763 xfs_agnumber_t agno, /* allocation group number */
2764 int flags) /* XFS_ALLOC_FLAGS_... */
2769 if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
2772 xfs_trans_brelse(tp, bp);
2777 * Put the block on the freelist for the allocation group.
2780 xfs_alloc_put_freelist(
2781 xfs_trans_t *tp, /* transaction pointer */
2782 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2783 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2784 xfs_agblock_t bno, /* block being freed */
2785 int btreeblk) /* block came from an AGF btree */
2787 xfs_agf_t *agf; /* a.g. freespace structure */
2788 __be32 *blockp;/* pointer to array entry */
2791 xfs_mount_t *mp; /* mount structure */
2792 xfs_perag_t *pag; /* per allocation group data */
2796 agf = XFS_BUF_TO_AGF(agbp);
2799 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2800 be32_to_cpu(agf->agf_seqno), &agflbp)))
2802 be32_add_cpu(&agf->agf_fllast, 1);
2803 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2804 agf->agf_fllast = 0;
2806 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2807 ASSERT(!pag->pagf_agflreset);
2808 be32_add_cpu(&agf->agf_flcount, 1);
2809 xfs_trans_agflist_delta(tp, 1);
2810 pag->pagf_flcount++;
2812 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2814 be32_add_cpu(&agf->agf_btreeblks, -1);
2815 pag->pagf_btreeblks--;
2816 logflags |= XFS_AGF_BTREEBLKS;
2820 xfs_alloc_log_agf(tp, agbp, logflags);
2822 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2824 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
2825 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2826 *blockp = cpu_to_be32(bno);
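/* Log only the single AGFL slot we just modified, not the whole block. */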
2827 startoff = (char *)blockp - (char *)agflbp->b_addr;
2829 xfs_alloc_log_agf(tp, agbp, logflags);
2831 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2832 xfs_trans_log_buf(tp, agflbp, startoff,
2833 startoff + sizeof(xfs_agblock_t) - 1);
2837 static xfs_failaddr_t
2841 struct xfs_mount *mp = bp->b_mount;
2842 struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
2844 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2845 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2846 return __this_address;
2847 if (!xfs_log_check_lsn(mp,
2848 be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
2849 return __this_address;
2852 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2853 return __this_address;
2855 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2856 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2857 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2858 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2859 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2860 return __this_address;
2862 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2863 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2864 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2865 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2866 return __this_address;
2868 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2869 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2870 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
2871 return __this_address;
2874 * during growfs operations, the perag is not fully initialised,
2875 * so we can't use it for any useful checking. growfs ensures we can't
2876 * use it by using uncached buffers that don't have the perag attached
2877 * so we can detect and avoid this problem.
2879 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2880 return __this_address;
2882 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2883 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2884 return __this_address;
2886 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2887 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2888 be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
2889 return __this_address;
2896 xfs_agf_read_verify(
2899 struct xfs_mount *mp = bp->b_mount;
2902 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2903 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2904 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2906 fa = xfs_agf_verify(bp);
2907 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2908 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2913 xfs_agf_write_verify(
2916 struct xfs_mount *mp = bp->b_mount;
2917 struct xfs_buf_log_item *bip = bp->b_log_item;
2920 fa = xfs_agf_verify(bp);
2922 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2926 if (!xfs_sb_version_hascrc(&mp->m_sb))
2930 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2932 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2935 const struct xfs_buf_ops xfs_agf_buf_ops = {
2937 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2938 .verify_read = xfs_agf_read_verify,
2939 .verify_write = xfs_agf_write_verify,
2940 .verify_struct = xfs_agf_verify,
2944 * Read in the allocation group header (free/alloc section).
2948 struct xfs_mount *mp, /* mount point structure */
2949 struct xfs_trans *tp, /* transaction pointer */
2950 xfs_agnumber_t agno, /* allocation group number */
2951 int flags, /* XFS_BUF_ */
2952 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2956 trace_xfs_read_agf(mp, agno);
2958 ASSERT(agno != NULLAGNUMBER);
2959 error = xfs_trans_read_buf(
2960 mp, tp, mp->m_ddev_targp,
2961 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2962 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2968 ASSERT(!(*bpp)->b_error);
2969 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2974 * Read in the allocation group header (free/alloc section).
2978 struct xfs_mount *mp, /* mount point structure */
2979 struct xfs_trans *tp, /* transaction pointer */
2980 xfs_agnumber_t agno, /* allocation group number */
2981 int flags, /* XFS_ALLOC_FLAG_... */
2982 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2984 struct xfs_agf *agf; /* ag freelist header */
2985 struct xfs_perag *pag; /* per allocation group data */
2988 trace_xfs_alloc_read_agf(mp, agno);
2990 ASSERT(agno != NULLAGNUMBER);
2991 error = xfs_read_agf(mp, tp, agno,
2992 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2998 ASSERT(!(*bpp)->b_error);
3000 agf = XFS_BUF_TO_AGF(*bpp);
3001 pag = xfs_perag_get(mp, agno);
3002 if (!pag->pagf_init) {
3003 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3004 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3005 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3006 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3007 pag->pagf_levels[XFS_BTNUM_BNOi] =
3008 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3009 pag->pagf_levels[XFS_BTNUM_CNTi] =
3010 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3011 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3012 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3013 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3015 pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
3018 else if (!XFS_FORCED_SHUTDOWN(mp)) {
3019 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3020 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3021 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3022 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3023 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3024 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3025 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3026 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3034 * Allocate an extent (variable-size).
3035 * Depending on the allocation type, we either look in a single allocation
3036 * group or loop over the allocation groups to find the result.
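 *
 * A minimal caller sketch (the values and the local names tp, mp, target and
 * len below are illustrative assumptions, not taken from a real caller):
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp		= tp,
 *		.mp		= mp,
 *		.fsbno		= target,
 *		.minlen		= 1,
 *		.maxlen		= len,
 *		.prod		= 1,
 *		.alignment	= 1,
 *		.type		= XFS_ALLOCTYPE_START_BNO,
 *		.oinfo		= XFS_RMAP_OINFO_ANY_OWNER,
 *		.resv		= XFS_AG_RESV_NONE,
 *	};
 *
 *	error = xfs_alloc_vextent(&args);
 *	if (!error && args.fsbno != NULLFSBLOCK)
 *		... the allocated extent is [args.fsbno, args.fsbno + args.len) ...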
3040 struct xfs_alloc_arg *args) /* allocation argument structure */
3042 xfs_agblock_t agsize; /* allocation group size */
3044 int flags; /* XFS_ALLOC_FLAG_... locking flags */
3045 struct xfs_mount *mp; /* mount structure pointer */
3046 xfs_agnumber_t sagno; /* starting allocation group number */
3047 xfs_alloctype_t type; /* input allocation type */
3049 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3052 type = args->otype = args->type;
3053 args->agbno = NULLAGBLOCK;
3055 * Just fix this up, for the case where the last a.g. is shorter
3056 * (or there's only one a.g.) and the caller couldn't easily figure
3057 * that out (xfs_bmap_alloc).
3059 agsize = mp->m_sb.sb_agblocks;
3060 if (args->maxlen > agsize)
3061 args->maxlen = agsize;
3062 if (args->alignment == 0)
3063 args->alignment = 1;
3064 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3065 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3066 ASSERT(args->minlen <= args->maxlen);
3067 ASSERT(args->minlen <= agsize);
3068 ASSERT(args->mod < args->prod);
3069 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3070 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3071 args->minlen > args->maxlen || args->minlen > agsize ||
3072 args->mod >= args->prod) {
3073 args->fsbno = NULLFSBLOCK;
3074 trace_xfs_alloc_vextent_badargs(args);
3079 case XFS_ALLOCTYPE_THIS_AG:
3080 case XFS_ALLOCTYPE_NEAR_BNO:
3081 case XFS_ALLOCTYPE_THIS_BNO:
3083 * These three force us into a single a.g.
3085 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3086 args->pag = xfs_perag_get(mp, args->agno);
3087 error = xfs_alloc_fix_freelist(args, 0);
3089 trace_xfs_alloc_vextent_nofix(args);
3093 trace_xfs_alloc_vextent_noagbp(args);
3096 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3097 if ((error = xfs_alloc_ag_vextent(args)))
3100 case XFS_ALLOCTYPE_START_BNO:
3102 * Try near allocation first, then anywhere-in-ag after
3103 * the first a.g. fails.
3105 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3106 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
3107 args->fsbno = XFS_AGB_TO_FSB(mp,
3108 ((mp->m_agfrotor / rotorstep) %
3109 mp->m_sb.sb_agcount), 0);
3112 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3113 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3115 case XFS_ALLOCTYPE_FIRST_AG:
3117 * Rotate through the allocation groups looking for a winner.
3119 if (type == XFS_ALLOCTYPE_FIRST_AG) {
3121 * Start with allocation group given by bno.
3123 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3124 args->type = XFS_ALLOCTYPE_THIS_AG;
3129 * Start with the given allocation group.
3131 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3132 flags = XFS_ALLOC_FLAG_TRYLOCK;
3135 * Loop over allocation groups twice; first time with
3136 * trylock set, second time without.
3139 args->pag = xfs_perag_get(mp, args->agno);
3140 error = xfs_alloc_fix_freelist(args, flags);
3142 trace_xfs_alloc_vextent_nofix(args);
3146 * If we get a buffer back then the allocation will fly.
3149 if ((error = xfs_alloc_ag_vextent(args)))
3154 trace_xfs_alloc_vextent_loopfailed(args);
3157 * Didn't work, figure out the next iteration.
3159 if (args->agno == sagno &&
3160 type == XFS_ALLOCTYPE_START_BNO)
3161 args->type = XFS_ALLOCTYPE_THIS_AG;
3163 * For the first allocation, we can try any AG to get
3164 * space. However, if we already have allocated a
3165 * block, we don't want to try AGs whose number is below
3166 * sagno. Otherwise, we may end up with out-of-order
3167 * locking of AGF, which might cause deadlock.
3169 if (++(args->agno) == mp->m_sb.sb_agcount) {
3170 if (args->tp->t_firstblock != NULLFSBLOCK)
3176 * Reached the starting a.g., must either be done
3177 * or switch to non-trylock mode.
3179 if (args->agno == sagno) {
3181 args->agbno = NULLAGBLOCK;
3182 trace_xfs_alloc_vextent_allfailed(args);
3187 if (type == XFS_ALLOCTYPE_START_BNO) {
3188 args->agbno = XFS_FSB_TO_AGBNO(mp,
3190 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3193 xfs_perag_put(args->pag);
3196 if (args->agno == sagno)
3197 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3198 (mp->m_sb.sb_agcount * rotorstep);
3200 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3201 (mp->m_sb.sb_agcount * rotorstep);
3208 if (args->agbno == NULLAGBLOCK)
3209 args->fsbno = NULLFSBLOCK;
3211 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3213 ASSERT(args->len >= args->minlen);
3214 ASSERT(args->len <= args->maxlen);
3215 ASSERT(args->agbno % args->alignment == 0);
3216 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3221 xfs_perag_put(args->pag);
3224 xfs_perag_put(args->pag);
3228 /* Ensure that the freelist is at full capacity. */
3230 xfs_free_extent_fix_freelist(
3231 struct xfs_trans *tp,
3232 xfs_agnumber_t agno,
3233 struct xfs_buf **agbp)
3235 struct xfs_alloc_arg args;
3238 memset(&args, 0, sizeof(struct xfs_alloc_arg));
3240 args.mp = tp->t_mountp;
3244 * Validate that the AG number is legal - this enables us to detect and
3245 * handle silent filesystem corruption rather than crashing.
3247 if (args.agno >= args.mp->m_sb.sb_agcount)
3248 return -EFSCORRUPTED;
3250 args.pag = xfs_perag_get(args.mp, args.agno);
3253 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3259 xfs_perag_put(args.pag);
3265 * Just break up the extent address and hand off to xfs_free_ag_extent
3266 * after fixing up the freelist.
3270 struct xfs_trans *tp,
3273 const struct xfs_owner_info *oinfo,
3274 enum xfs_ag_resv_type type,
3277 struct xfs_mount *mp = tp->t_mountp;
3278 struct xfs_buf *agbp;
3279 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
3280 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
3282 unsigned int busy_flags = 0;
3285 ASSERT(type != XFS_AG_RESV_AGFL);
3287 if (XFS_TEST_ERROR(false, mp,
3288 XFS_ERRTAG_FREE_EXTENT))
3291 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
3295 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3296 error = -EFSCORRUPTED;
3300 /* validate that the extent size is legal now that we have the agf locked */
3301 if (XFS_IS_CORRUPT(mp,
3303 be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) {
3304 error = -EFSCORRUPTED;
3308 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3313 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3314 xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
3318 xfs_trans_brelse(tp, agbp);
3322 struct xfs_alloc_query_range_info {
3323 xfs_alloc_query_range_fn fn;
3327 /* Format btree record and pass to our callback. */
3329 xfs_alloc_query_range_helper(
3330 struct xfs_btree_cur *cur,
3331 union xfs_btree_rec *rec,
3334 struct xfs_alloc_query_range_info *query = priv;
3335 struct xfs_alloc_rec_incore irec;
3337 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3338 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3339 return query->fn(cur, &irec, query->priv);
3342 /* Find all free space within a given range of blocks. */
3344 xfs_alloc_query_range(
3345 struct xfs_btree_cur *cur,
3346 struct xfs_alloc_rec_incore *low_rec,
3347 struct xfs_alloc_rec_incore *high_rec,
3348 xfs_alloc_query_range_fn fn,
3351 union xfs_btree_irec low_brec;
3352 union xfs_btree_irec high_brec;
3353 struct xfs_alloc_query_range_info query;
3355 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3356 low_brec.a = *low_rec;
3357 high_brec.a = *high_rec;
3360 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3361 xfs_alloc_query_range_helper, &query);
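/*
 * Callback sketch (the helper name and counter are hypothetical, shown only
 * to illustrate the xfs_alloc_query_range_fn signature): count the free
 * space records returned by a query.
 *
 *	STATIC int
 *	xfs_alloc_count_helper(
 *		struct xfs_btree_cur		*cur,
 *		struct xfs_alloc_rec_incore	*rec,
 *		void				*priv)
 *	{
 *		uint64_t	*nr = priv;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 * Such a helper would be passed as @fn to xfs_alloc_query_range() or
 * xfs_alloc_query_all(), with a pointer to the counter as @priv.
 */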
3364 /* Find all free space records. */
3366 xfs_alloc_query_all(
3367 struct xfs_btree_cur *cur,
3368 xfs_alloc_query_range_fn fn,
3371 struct xfs_alloc_query_range_info query;
3373 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3376 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3379 /* Is there a record covering a given extent? */
3381 xfs_alloc_has_record(
3382 struct xfs_btree_cur *cur,
3387 union xfs_btree_irec low;
3388 union xfs_btree_irec high;
3390 memset(&low, 0, sizeof(low));
3391 low.a.ar_startblock = bno;
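/* Fill the high key with 0xFF so any field we do not set compares as maximal. */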
3392 memset(&high, 0xFF, sizeof(high));
3393 high.a.ar_startblock = bno + len - 1;
3395 return xfs_btree_has_record(cur, &low, &high, exists);
3399 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
3400 * error code or XFS_ITER_*.
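 *
 * A @walk_fn sketch (the helper name is hypothetical, for illustration
 * only); returning a nonzero value stops the walk and is passed back to
 * the caller:
 *
 *	static int
 *	xfs_agfl_find_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		xfs_agblock_t		*target = priv;
 *
 *		if (agbno == *target)
 *			return -ECANCELED;
 *		return 0;
 *	}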
3404 struct xfs_mount *mp,
3405 struct xfs_agf *agf,
3406 struct xfs_buf *agflbp,
3407 xfs_agfl_walk_fn walk_fn,
3414 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
3415 i = be32_to_cpu(agf->agf_flfirst);
3417 /* Nothing to walk in an empty AGFL. */
3418 if (agf->agf_flcount == cpu_to_be32(0))
3421 /* Otherwise, walk from first to last, wrapping as needed. */
3423 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3426 if (i == be32_to_cpu(agf->agf_fllast))
3428 if (++i == xfs_agfl_size(mp))