/* fs/xfs/libxfs/xfs_ialloc.c — XFS inode allocation btree routines (linux.git) */
0b61f8a4 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
7b718769
NS
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
1da177e4 5 */
1da177e4 6#include "xfs.h"
a844f451 7#include "xfs_fs.h"
70a9883c 8#include "xfs_shared.h"
239880ef
DC
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
a844f451 12#include "xfs_bit.h"
1da177e4 13#include "xfs_mount.h"
1da177e4 14#include "xfs_inode.h"
a844f451
NS
15#include "xfs_btree.h"
16#include "xfs_ialloc.h"
a4fbe6ab 17#include "xfs_ialloc_btree.h"
1da177e4 18#include "xfs_alloc.h"
e9e899a2 19#include "xfs_errortag.h"
1da177e4
LT
20#include "xfs_error.h"
21#include "xfs_bmap.h"
239880ef 22#include "xfs_trans.h"
983d09ff 23#include "xfs_buf_item.h"
ddf6ad01 24#include "xfs_icreate_item.h"
7bb85ef3 25#include "xfs_icache.h"
d123031a 26#include "xfs_trace.h"
a45086e2 27#include "xfs_log.h"
340785cc 28#include "xfs_rmap.h"
9bbafc71 29#include "xfs_ag.h"
de6077ec 30#include "xfs_health.h"
1da177e4 31
fe033cc8 32/*
21875505 33 * Lookup a record by ino in the btree given by cur.
fe033cc8 34 */
81e25176 35int /* error */
21875505 36xfs_inobt_lookup(
fe033cc8
CH
37 struct xfs_btree_cur *cur, /* btree cursor */
38 xfs_agino_t ino, /* starting inode of chunk */
21875505 39 xfs_lookup_t dir, /* <=, >=, == */
fe033cc8
CH
40 int *stat) /* success/failure */
41{
42 cur->bc_rec.i.ir_startino = ino;
5419040f
BF
43 cur->bc_rec.i.ir_holemask = 0;
44 cur->bc_rec.i.ir_count = 0;
21875505
CH
45 cur->bc_rec.i.ir_freecount = 0;
46 cur->bc_rec.i.ir_free = 0;
47 return xfs_btree_lookup(cur, dir, stat);
fe033cc8
CH
48}
49
278d0ca1 50/*
afabc24a 51 * Update the record referred to by cur to the value given.
278d0ca1
CH
52 * This either works (return 0) or gets an EFSCORRUPTED error.
53 */
54STATIC int /* error */
55xfs_inobt_update(
56 struct xfs_btree_cur *cur, /* btree cursor */
afabc24a 57 xfs_inobt_rec_incore_t *irec) /* btree record */
278d0ca1
CH
58{
59 union xfs_btree_rec rec;
60
afabc24a 61 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
38c26bfd 62 if (xfs_has_sparseinodes(cur->bc_mp)) {
5419040f
BF
63 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
64 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
65 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
66 } else {
67 /* ir_holemask/ir_count not supported on-disk */
68 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
69 }
afabc24a 70 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
278d0ca1
CH
71 return xfs_btree_update(cur, &rec);
72}
73
e936945e
DW
74/* Convert on-disk btree record to incore inobt record. */
75void
76xfs_inobt_btrec_to_irec(
77 struct xfs_mount *mp,
159eb69d 78 const union xfs_btree_rec *rec,
e936945e 79 struct xfs_inobt_rec_incore *irec)
8cc938fe 80{
5419040f 81 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
38c26bfd 82 if (xfs_has_sparseinodes(mp)) {
5419040f
BF
83 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
84 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
85 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
86 } else {
87 /*
88 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
89 * values for full inode chunks.
90 */
91 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
92 irec->ir_count = XFS_INODES_PER_CHUNK;
93 irec->ir_freecount =
94 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
8cc938fe 95 }
5419040f 96 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
e936945e
DW
97}
98
dbfbf3bd
DW
99/* Compute the freecount of an incore inode record. */
100uint8_t
101xfs_inobt_rec_freecount(
102 const struct xfs_inobt_rec_incore *irec)
103{
104 uint64_t realfree = irec->ir_free;
105
106 if (xfs_inobt_issparse(irec->ir_holemask))
107 realfree &= xfs_inobt_irec_to_allocmask(irec);
108 return hweight64(realfree);
109}
110
366a0b8d
DW
111/* Simple checks for inode records. */
112xfs_failaddr_t
113xfs_inobt_check_irec(
dbfbf3bd 114 struct xfs_perag *pag,
366a0b8d
DW
115 const struct xfs_inobt_rec_incore *irec)
116{
de1a9ce2 117 /* Record has to be properly aligned within the AG. */
dbfbf3bd 118 if (!xfs_verify_agino(pag, irec->ir_startino))
366a0b8d 119 return __this_address;
dbfbf3bd 120 if (!xfs_verify_agino(pag,
de1a9ce2
DW
121 irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
122 return __this_address;
366a0b8d
DW
123 if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
124 irec->ir_count > XFS_INODES_PER_CHUNK)
125 return __this_address;
126 if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
127 return __this_address;
128
dbfbf3bd 129 if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
366a0b8d
DW
130 return __this_address;
131
132 return NULL;
133}
134
ee12eaaa
DW
135static inline int
136xfs_inobt_complain_bad_rec(
137 struct xfs_btree_cur *cur,
138 xfs_failaddr_t fa,
139 const struct xfs_inobt_rec_incore *irec)
140{
141 struct xfs_mount *mp = cur->bc_mp;
142
143 xfs_warn(mp,
77953b97
CH
144 "%sbt record corruption in AG %d detected at %pS!",
145 cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
ee12eaaa
DW
146 xfs_warn(mp,
147"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
148 irec->ir_startino, irec->ir_count, irec->ir_freecount,
149 irec->ir_free, irec->ir_holemask);
a78d10f4 150 xfs_btree_mark_sick(cur);
ee12eaaa
DW
151 return -EFSCORRUPTED;
152}
153
e936945e
DW
154/*
155 * Get the data from the pointed-to record.
156 */
157int
158xfs_inobt_get_rec(
159 struct xfs_btree_cur *cur,
160 struct xfs_inobt_rec_incore *irec,
161 int *stat)
162{
9e6c08d4 163 struct xfs_mount *mp = cur->bc_mp;
e936945e 164 union xfs_btree_rec *rec;
366a0b8d 165 xfs_failaddr_t fa;
e936945e
DW
166 int error;
167
168 error = xfs_btree_get_rec(cur, &rec, stat);
169 if (error || *stat == 0)
170 return error;
171
9e6c08d4 172 xfs_inobt_btrec_to_irec(mp, rec, irec);
dbfbf3bd 173 fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
366a0b8d 174 if (fa)
ee12eaaa 175 return xfs_inobt_complain_bad_rec(cur, fa, irec);
5419040f
BF
176
177 return 0;
8cc938fe
CH
178}
179
0aa0a756
BF
180/*
181 * Insert a single inobt record. Cursor must already point to desired location.
182 */
7f8f1313 183int
0aa0a756
BF
184xfs_inobt_insert_rec(
185 struct xfs_btree_cur *cur,
c8ce540d
DW
186 uint16_t holemask,
187 uint8_t count,
188 int32_t freecount,
0aa0a756
BF
189 xfs_inofree_t free,
190 int *stat)
191{
5419040f
BF
192 cur->bc_rec.i.ir_holemask = holemask;
193 cur->bc_rec.i.ir_count = count;
0aa0a756
BF
194 cur->bc_rec.i.ir_freecount = freecount;
195 cur->bc_rec.i.ir_free = free;
196 return xfs_btree_insert(cur, stat);
197}
198
199/*
200 * Insert records describing a newly allocated inode chunk into the inobt.
201 */
202STATIC int
203xfs_inobt_insert(
dedab3e4 204 struct xfs_perag *pag,
0aa0a756
BF
205 struct xfs_trans *tp,
206 struct xfs_buf *agbp,
207 xfs_agino_t newino,
208 xfs_agino_t newlen,
fbeef4e0 209 bool is_finobt)
0aa0a756
BF
210{
211 struct xfs_btree_cur *cur;
0aa0a756
BF
212 xfs_agino_t thisino;
213 int i;
214 int error;
215
fbeef4e0 216 if (is_finobt)
14dd46cf
CH
217 cur = xfs_finobt_init_cursor(pag, tp, agbp);
218 else
219 cur = xfs_inobt_init_cursor(pag, tp, agbp);
0aa0a756
BF
220
221 for (thisino = newino;
222 thisino < newino + newlen;
223 thisino += XFS_INODES_PER_CHUNK) {
224 error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
225 if (error) {
226 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
227 return error;
228 }
229 ASSERT(i == 0);
230
5419040f
BF
231 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
232 XFS_INODES_PER_CHUNK,
233 XFS_INODES_PER_CHUNK,
0aa0a756
BF
234 XFS_INOBT_ALL_FREE, &i);
235 if (error) {
236 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
237 return error;
238 }
239 ASSERT(i == 1);
240 }
241
242 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
243
244 return 0;
245}
246
/*
 * Verify that the number of free inodes in the AGI is correct.
 *
 * Debug-only: walks every record of a single-level btree and compares the
 * summed freecount against the cached perag value.  Compiles to 0 on
 * non-DEBUG builds.
 */
#ifdef DEBUG
static int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	xfs_inobt_rec_incore_t	rec;
	int			freecount = 0;
	int			error;
	int			i;

	/* Only cheap enough to walk when the tree is a single level. */
	if (cur->bc_nlevels != 1)
		return 0;

	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;

	do {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			return error;

		if (i) {
			freecount += rec.ir_freecount;
			error = xfs_btree_increment(cur, 0, &i);
			if (error)
				return error;
		}
	} while (i == 1);

	if (!xfs_is_shutdown(cur->bc_mp))
		ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif
286
85c0b2ab 287/*
28c8e41a
DC
288 * Initialise a new set of inodes. When called without a transaction context
289 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
290 * than logging them (which in a transaction context puts them into the AIL
291 * for writeback rather than the xfsbufd queue).
85c0b2ab 292 */
ddf6ad01 293int
85c0b2ab
DC
294xfs_ialloc_inode_init(
295 struct xfs_mount *mp,
296 struct xfs_trans *tp,
28c8e41a 297 struct list_head *buffer_list,
463958af 298 int icount,
85c0b2ab
DC
299 xfs_agnumber_t agno,
300 xfs_agblock_t agbno,
301 xfs_agblock_t length,
302 unsigned int gen)
303{
304 struct xfs_buf *fbuf;
305 struct xfs_dinode *free;
83dcdb44 306 int nbufs;
85c0b2ab
DC
307 int version;
308 int i, j;
309 xfs_daddr_t d;
93848a99 310 xfs_ino_t ino = 0;
ce92464c 311 int error;
85c0b2ab
DC
312
313 /*
6e0c7b8c
JL
314 * Loop over the new block(s), filling in the inodes. For small block
315 * sizes, manipulate the inodes in buffers which are multiples of the
316 * blocks size.
85c0b2ab 317 */
ef325959 318 nbufs = length / M_IGEO(mp)->blocks_per_cluster;
85c0b2ab
DC
319
320 /*
93848a99
CH
321 * Figure out what version number to use in the inodes we create. If
322 * the superblock version has caught up to the one that supports the new
323 * inode format, then use the new inode version. Otherwise use the old
324 * version so that old kernels will continue to be able to use the file
325 * system.
326 *
327 * For v3 inodes, we also need to write the inode number into the inode,
328 * so calculate the first inode number of the chunk here as
43004b2a 329 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
93848a99
CH
330 * across multiple filesystem blocks (such as a cluster) and so cannot
331 * be used in the cluster buffer loop below.
332 *
333 * Further, because we are writing the inode directly into the buffer
334 * and calculating a CRC on the entire inode, we have ot log the entire
335 * inode so that the entire range the CRC covers is present in the log.
336 * That means for v3 inode we log the entire buffer rather than just the
337 * inode cores.
85c0b2ab 338 */
ebd9027d 339 if (xfs_has_v3inodes(mp)) {
93848a99 340 version = 3;
43004b2a 341 ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
ddf6ad01
DC
342
343 /*
344 * log the initialisation that is about to take place as an
345 * logical operation. This means the transaction does not
346 * need to log the physical changes to the inode buffers as log
347 * recovery will know what initialisation is actually needed.
348 * Hence we only need to log the buffers as "ordered" buffers so
349 * they track in the AIL as if they were physically logged.
350 */
351 if (tp)
463958af 352 xfs_icreate_log(tp, agno, agbno, icount,
ddf6ad01 353 mp->m_sb.sb_inodesize, length, gen);
263997a6 354 } else
85c0b2ab 355 version = 2;
85c0b2ab
DC
356
357 for (j = 0; j < nbufs; j++) {
358 /*
359 * Get the block.
360 */
83dcdb44 361 d = XFS_AGB_TO_DADDR(mp, agno, agbno +
ef325959 362 (j * M_IGEO(mp)->blocks_per_cluster));
ce92464c
DW
363 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
364 mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
365 XBF_UNMAPPED, &fbuf);
366 if (error)
367 return error;
ddf6ad01
DC
368
369 /* Initialize the inode buffers and log them appropriately. */
1813dd64 370 fbuf->b_ops = &xfs_inode_buf_ops;
93848a99 371 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
ef325959 372 for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
85c0b2ab 373 int ioffset = i << mp->m_sb.sb_inodelog;
85c0b2ab
DC
374
375 free = xfs_make_iptr(mp, fbuf, i);
376 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
377 free->di_version = version;
378 free->di_gen = cpu_to_be32(gen);
379 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
93848a99
CH
380
381 if (version == 3) {
382 free->di_ino = cpu_to_be64(ino);
383 ino++;
ce748eaa
ES
384 uuid_copy(&free->di_uuid,
385 &mp->m_sb.sb_meta_uuid);
93848a99 386 xfs_dinode_calc_crc(mp, free);
28c8e41a 387 } else if (tp) {
93848a99
CH
388 /* just log the inode core */
389 xfs_trans_log_buf(tp, fbuf, ioffset,
cf28e17c 390 ioffset + XFS_DINODE_SIZE(mp) - 1);
93848a99
CH
391 }
392 }
28c8e41a
DC
393
394 if (tp) {
395 /*
396 * Mark the buffer as an inode allocation buffer so it
397 * sticks in AIL at the point of this allocation
398 * transaction. This ensures the they are on disk before
399 * the tail of the log can be moved past this
400 * transaction (i.e. by preventing relogging from moving
401 * it forward in the log).
402 */
403 xfs_trans_inode_alloc_buf(tp, fbuf);
404 if (version == 3) {
ddf6ad01
DC
405 /*
406 * Mark the buffer as ordered so that they are
407 * not physically logged in the transaction but
408 * still tracked in the AIL as part of the
409 * transaction and pin the log appropriately.
410 */
411 xfs_trans_ordered_buf(tp, fbuf);
28c8e41a
DC
412 }
413 } else {
414 fbuf->b_flags |= XBF_DONE;
415 xfs_buf_delwri_queue(fbuf, buffer_list);
416 xfs_buf_relse(fbuf);
85c0b2ab 417 }
85c0b2ab 418 }
2a30f36d 419 return 0;
85c0b2ab
DC
420}
421
56d1115c
BF
422/*
423 * Align startino and allocmask for a recently allocated sparse chunk such that
424 * they are fit for insertion (or merge) into the on-disk inode btrees.
425 *
426 * Background:
427 *
428 * When enabled, sparse inode support increases the inode alignment from cluster
429 * size to inode chunk size. This means that the minimum range between two
430 * non-adjacent inode records in the inobt is large enough for a full inode
431 * record. This allows for cluster sized, cluster aligned block allocation
432 * without need to worry about whether the resulting inode record overlaps with
433 * another record in the tree. Without this basic rule, we would have to deal
434 * with the consequences of overlap by potentially undoing recent allocations in
435 * the inode allocation codepath.
436 *
437 * Because of this alignment rule (which is enforced on mount), there are two
438 * inobt possibilities for newly allocated sparse chunks. One is that the
439 * aligned inode record for the chunk covers a range of inodes not already
440 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
441 * other is that a record already exists at the aligned startino that considers
442 * the newly allocated range as sparse. In the latter case, record content is
443 * merged in hope that sparse inode chunks fill to full chunks over time.
444 */
445STATIC void
446xfs_align_sparse_ino(
447 struct xfs_mount *mp,
448 xfs_agino_t *startino,
449 uint16_t *allocmask)
450{
451 xfs_agblock_t agbno;
452 xfs_agblock_t mod;
453 int offset;
454
455 agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
456 mod = agbno % mp->m_sb.sb_inoalignmt;
457 if (!mod)
458 return;
459
460 /* calculate the inode offset and align startino */
43004b2a 461 offset = XFS_AGB_TO_AGINO(mp, mod);
56d1115c
BF
462 *startino -= offset;
463
464 /*
465 * Since startino has been aligned down, left shift allocmask such that
466 * it continues to represent the same physical inodes relative to the
467 * new startino.
468 */
469 *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
470}
471
472/*
473 * Determine whether the source inode record can merge into the target. Both
474 * records must be sparse, the inode ranges must match and there must be no
475 * allocation overlap between the records.
476 */
477STATIC bool
478__xfs_inobt_can_merge(
479 struct xfs_inobt_rec_incore *trec, /* tgt record */
480 struct xfs_inobt_rec_incore *srec) /* src record */
481{
482 uint64_t talloc;
483 uint64_t salloc;
484
485 /* records must cover the same inode range */
486 if (trec->ir_startino != srec->ir_startino)
487 return false;
488
489 /* both records must be sparse */
490 if (!xfs_inobt_issparse(trec->ir_holemask) ||
491 !xfs_inobt_issparse(srec->ir_holemask))
492 return false;
493
494 /* both records must track some inodes */
495 if (!trec->ir_count || !srec->ir_count)
496 return false;
497
498 /* can't exceed capacity of a full record */
499 if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
500 return false;
501
502 /* verify there is no allocation overlap */
503 talloc = xfs_inobt_irec_to_allocmask(trec);
504 salloc = xfs_inobt_irec_to_allocmask(srec);
505 if (talloc & salloc)
506 return false;
507
508 return true;
509}
510
511/*
512 * Merge the source inode record into the target. The caller must call
513 * __xfs_inobt_can_merge() to ensure the merge is valid.
514 */
515STATIC void
516__xfs_inobt_rec_merge(
517 struct xfs_inobt_rec_incore *trec, /* target */
518 struct xfs_inobt_rec_incore *srec) /* src */
519{
520 ASSERT(trec->ir_startino == srec->ir_startino);
521
522 /* combine the counts */
523 trec->ir_count += srec->ir_count;
524 trec->ir_freecount += srec->ir_freecount;
525
526 /*
527 * Merge the holemask and free mask. For both fields, 0 bits refer to
528 * allocated inodes. We combine the allocated ranges with bitwise AND.
529 */
530 trec->ir_holemask &= srec->ir_holemask;
531 trec->ir_free &= srec->ir_free;
532}
533
534/*
8541a7d9
CH
535 * Insert a new sparse inode chunk into the associated inode allocation btree.
536 * The inode record for the sparse chunk is pre-aligned to a startino that
537 * should match any pre-existing sparse inode record in the tree. This allows
538 * sparse chunks to fill over time.
56d1115c 539 *
8541a7d9
CH
540 * If no preexisting record exists, the provided record is inserted.
541 * If there is a preexisting record, the provided record is merged with the
56d1115c 542 * existing record and updated in place. The merged record is returned in nrec.
56d1115c
BF
543 *
544 * It is considered corruption if a merge is requested and not possible. Given
545 * the sparse inode alignment constraints, this should never happen.
546 */
547STATIC int
548xfs_inobt_insert_sprec(
dedab3e4 549 struct xfs_perag *pag,
56d1115c
BF
550 struct xfs_trans *tp,
551 struct xfs_buf *agbp,
8541a7d9 552 struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
56d1115c 553{
dedab3e4 554 struct xfs_mount *mp = pag->pag_mount;
56d1115c 555 struct xfs_btree_cur *cur;
56d1115c
BF
556 int error;
557 int i;
558 struct xfs_inobt_rec_incore rec;
559
14dd46cf 560 cur = xfs_inobt_init_cursor(pag, tp, agbp);
56d1115c
BF
561
562 /* the new record is pre-aligned so we know where to look */
563 error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
564 if (error)
565 goto error;
566 /* if nothing there, insert a new record and return */
567 if (i == 0) {
568 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
569 nrec->ir_count, nrec->ir_freecount,
570 nrec->ir_free, &i);
571 if (error)
572 goto error;
f9e03706 573 if (XFS_IS_CORRUPT(mp, i != 1)) {
989d5ec3 574 xfs_btree_mark_sick(cur);
f9e03706
DW
575 error = -EFSCORRUPTED;
576 goto error;
577 }
56d1115c
BF
578
579 goto out;
580 }
581
582 /*
8541a7d9 583 * A record exists at this startino. Merge the records.
56d1115c 584 */
8541a7d9
CH
585 error = xfs_inobt_get_rec(cur, &rec, &i);
586 if (error)
587 goto error;
588 if (XFS_IS_CORRUPT(mp, i != 1)) {
589 xfs_btree_mark_sick(cur);
590 error = -EFSCORRUPTED;
591 goto error;
592 }
593 if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
594 xfs_btree_mark_sick(cur);
595 error = -EFSCORRUPTED;
596 goto error;
597 }
56d1115c 598
8541a7d9
CH
599 /*
600 * This should never fail. If we have coexisting records that
601 * cannot merge, something is seriously wrong.
602 */
603 if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
604 xfs_btree_mark_sick(cur);
605 error = -EFSCORRUPTED;
606 goto error;
607 }
56d1115c 608
8541a7d9
CH
609 trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
610 rec.ir_holemask, nrec->ir_startino,
611 nrec->ir_holemask);
56d1115c 612
8541a7d9
CH
613 /* merge to nrec to output the updated record */
614 __xfs_inobt_rec_merge(nrec, &rec);
56d1115c 615
8541a7d9
CH
616 trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
617 nrec->ir_holemask);
56d1115c 618
8541a7d9
CH
619 error = xfs_inobt_rec_check_count(mp, nrec);
620 if (error)
621 goto error;
56d1115c
BF
622
623 error = xfs_inobt_update(cur, nrec);
624 if (error)
625 goto error;
626
627out:
628 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
629 return 0;
630error:
631 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
632 return error;
633}
634
8541a7d9
CH
635/*
636 * Insert a new sparse inode chunk into the free inode btree. The inode
637 * record for the sparse chunk is pre-aligned to a startino that should match
638 * any pre-existing sparse inode record in the tree. This allows sparse chunks
639 * to fill over time.
640 *
641 * The new record is always inserted, overwriting a pre-existing record if
642 * there is one.
643 */
644STATIC int
645xfs_finobt_insert_sprec(
646 struct xfs_perag *pag,
647 struct xfs_trans *tp,
648 struct xfs_buf *agbp,
649 struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
650{
651 struct xfs_mount *mp = pag->pag_mount;
652 struct xfs_btree_cur *cur;
653 int error;
654 int i;
655
14dd46cf 656 cur = xfs_finobt_init_cursor(pag, tp, agbp);
8541a7d9
CH
657
658 /* the new record is pre-aligned so we know where to look */
659 error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
660 if (error)
661 goto error;
662 /* if nothing there, insert a new record and return */
663 if (i == 0) {
664 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
665 nrec->ir_count, nrec->ir_freecount,
666 nrec->ir_free, &i);
667 if (error)
668 goto error;
669 if (XFS_IS_CORRUPT(mp, i != 1)) {
670 xfs_btree_mark_sick(cur);
671 error = -EFSCORRUPTED;
672 goto error;
673 }
674 } else {
675 error = xfs_inobt_update(cur, nrec);
676 if (error)
677 goto error;
678 }
679
680 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
681 return 0;
682error:
683 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
684 return error;
685}
686
687
1da177e4 688/*
8237fbf5
DC
689 * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
690 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
691 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
692 * inode count threshold, or the usual negative error code for other errors.
1da177e4 693 */
ef325959 694STATIC int
1da177e4 695xfs_ialloc_ag_alloc(
dedab3e4 696 struct xfs_perag *pag,
ef325959 697 struct xfs_trans *tp,
dedab3e4 698 struct xfs_buf *agbp)
1da177e4 699{
ef325959
DW
700 struct xfs_agi *agi;
701 struct xfs_alloc_arg args;
ef325959
DW
702 int error;
703 xfs_agino_t newino; /* new first inode's number */
704 xfs_agino_t newlen; /* new number of inodes */
705 int isaligned = 0; /* inode allocation at stripe */
706 /* unit boundary */
707 /* init. to full chunk */
56d1115c 708 struct xfs_inobt_rec_incore rec;
ef325959 709 struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
7b13c515 710 uint16_t allocmask = (uint16_t) -1;
ef325959 711 int do_sparse = 0;
1cdadee1 712
a0041684 713 memset(&args, 0, sizeof(args));
1da177e4
LT
714 args.tp = tp;
715 args.mp = tp->t_mountp;
1cdadee1 716 args.fsbno = NULLFSBLOCK;
7280feda 717 args.oinfo = XFS_RMAP_OINFO_INODES;
74c36a86 718 args.pag = pag;
1da177e4 719
46fc58da
BF
720#ifdef DEBUG
721 /* randomly do sparse inode allocations */
ebd9027d 722 if (xfs_has_sparseinodes(tp->t_mountp) &&
ef325959 723 igeo->ialloc_min_blks < igeo->ialloc_blks)
8032bf12 724 do_sparse = get_random_u32_below(2);
46fc58da
BF
725#endif
726
1da177e4
LT
727 /*
728 * Locking will ensure that we don't have two callers in here
729 * at one time.
730 */
ef325959
DW
731 newlen = igeo->ialloc_inos;
732 if (igeo->maxicount &&
74f9ce1c 733 percpu_counter_read_positive(&args.mp->m_icount) + newlen >
ef325959 734 igeo->maxicount)
2451337d 735 return -ENOSPC;
ef325959 736 args.minlen = args.maxlen = igeo->ialloc_blks;
1da177e4 737 /*
3ccb8b5f
GO
738 * First try to allocate inodes contiguous with the last-allocated
739 * chunk of inodes. If the filesystem is striped, this will fill
740 * an entire stripe unit with inodes.
28c8e41a 741 */
370c782b 742 agi = agbp->b_addr;
3ccb8b5f 743 newino = be32_to_cpu(agi->agi_newino);
019ff2d5 744 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
ef325959 745 igeo->ialloc_blks;
1cdadee1
BF
746 if (do_sparse)
747 goto sparse_alloc;
019ff2d5
NS
748 if (likely(newino != NULLAGINO &&
749 (args.agbno < be32_to_cpu(agi->agi_length)))) {
3ccb8b5f 750 args.prod = 1;
75de2a91 751
3ccb8b5f 752 /*
75de2a91
DC
753 * We need to take into account alignment here to ensure that
754 * we don't modify the free list if we fail to have an exact
755 * block. If we don't have an exact match, and every oher
756 * attempt allocation attempt fails, we'll end up cancelling
757 * a dirty transaction and shutting down.
758 *
759 * For an exact allocation, alignment must be 1,
760 * however we need to take cluster alignment into account when
761 * fixing up the freelist. Use the minalignslop field to
762 * indicate that extra blocks might be required for alignment,
763 * but not to use them in the actual exact allocation.
3ccb8b5f 764 */
75de2a91 765 args.alignment = 1;
ef325959 766 args.minalignslop = igeo->cluster_align - 1;
75de2a91
DC
767
768 /* Allow space for the inode btree to split. */
657f1019 769 args.minleft = igeo->inobt_maxlevels;
5f36b2ce
DC
770 error = xfs_alloc_vextent_exact_bno(&args,
771 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
772 args.agbno));
74c36a86 773 if (error)
3ccb8b5f 774 return error;
e480a723
BF
775
776 /*
777 * This request might have dirtied the transaction if the AG can
778 * satisfy the request, but the exact block was not available.
779 * If the allocation did fail, subsequent requests will relax
780 * the exact agbno requirement and increase the alignment
781 * instead. It is critical that the total size of the request
782 * (len + alignment + slop) does not increase from this point
783 * on, so reset minalignslop to ensure it is not included in
784 * subsequent requests.
785 */
786 args.minalignslop = 0;
1cdadee1 787 }
1da177e4 788
3ccb8b5f
GO
789 if (unlikely(args.fsbno == NULLFSBLOCK)) {
790 /*
791 * Set the alignment for the allocation.
792 * If stripe alignment is turned on then align at stripe unit
793 * boundary.
019ff2d5
NS
794 * If the cluster size is smaller than a filesystem block
795 * then we're doing I/O for inodes in filesystem block size
3ccb8b5f
GO
796 * pieces, so don't need alignment anyway.
797 */
798 isaligned = 0;
ef325959 799 if (igeo->ialloc_align) {
0560f31a 800 ASSERT(!xfs_has_noalign(args.mp));
3ccb8b5f
GO
801 args.alignment = args.mp->m_dalign;
802 isaligned = 1;
75de2a91 803 } else
ef325959 804 args.alignment = igeo->cluster_align;
3ccb8b5f
GO
805 /*
806 * Allocate a fixed-size extent of inodes.
807 */
3ccb8b5f
GO
808 args.prod = 1;
809 /*
810 * Allow space for the inode btree to split.
811 */
657f1019 812 args.minleft = igeo->inobt_maxlevels;
db4710fd
DC
813 error = xfs_alloc_vextent_near_bno(&args,
814 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
815 be32_to_cpu(agi->agi_root)));
74c36a86 816 if (error)
3ccb8b5f
GO
817 return error;
818 }
019ff2d5 819
1da177e4
LT
820 /*
821 * If stripe alignment is turned on, then try again with cluster
822 * alignment.
823 */
824 if (isaligned && args.fsbno == NULLFSBLOCK) {
ef325959 825 args.alignment = igeo->cluster_align;
db4710fd
DC
826 error = xfs_alloc_vextent_near_bno(&args,
827 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
828 be32_to_cpu(agi->agi_root)));
829 if (error)
1da177e4
LT
830 return error;
831 }
832
56d1115c
BF
833 /*
834 * Finally, try a sparse allocation if the filesystem supports it and
835 * the sparse allocation length is smaller than a full chunk.
836 */
ebd9027d 837 if (xfs_has_sparseinodes(args.mp) &&
ef325959 838 igeo->ialloc_min_blks < igeo->ialloc_blks &&
56d1115c 839 args.fsbno == NULLFSBLOCK) {
1cdadee1 840sparse_alloc:
56d1115c
BF
841 args.alignment = args.mp->m_sb.sb_spino_align;
842 args.prod = 1;
843
ef325959 844 args.minlen = igeo->ialloc_min_blks;
56d1115c
BF
845 args.maxlen = args.minlen;
846
847 /*
848 * The inode record will be aligned to full chunk size. We must
849 * prevent sparse allocation from AG boundaries that result in
850 * invalid inode records, such as records that start at agbno 0
851 * or extend beyond the AG.
852 *
853 * Set min agbno to the first aligned, non-zero agbno and max to
854 * the last aligned agbno that is at least one full chunk from
855 * the end of the AG.
856 */
857 args.min_agbno = args.mp->m_sb.sb_inoalignmt;
858 args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
859 args.mp->m_sb.sb_inoalignmt) -
ef325959 860 igeo->ialloc_blks;
56d1115c 861
db4710fd
DC
862 error = xfs_alloc_vextent_near_bno(&args,
863 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
864 be32_to_cpu(agi->agi_root)));
56d1115c
BF
865 if (error)
866 return error;
867
43004b2a 868 newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
46fc58da 869 ASSERT(newlen <= XFS_INODES_PER_CHUNK);
56d1115c
BF
870 allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
871 }
872
3937493c 873 if (args.fsbno == NULLFSBLOCK)
8237fbf5 874 return -EAGAIN;
3937493c 875
1da177e4 876 ASSERT(args.len == args.minlen);
1da177e4 877
359346a9 878 /*
85c0b2ab
DC
879 * Stamp and write the inode buffers.
880 *
359346a9
DC
881 * Seed the new inode cluster with a random generation number. This
882 * prevents short-term reuse of generation numbers if a chunk is
883 * freed and then immediately reallocated. We use random numbers
884 * rather than a linear progression to prevent the next generation
885 * number from being easily guessable.
886 */
7b13c515 887 error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
a251c17a 888 args.agbno, args.len, get_random_u32());
d42f08f6 889
2a30f36d
CS
890 if (error)
891 return error;
85c0b2ab
DC
892 /*
893 * Convert the results.
894 */
43004b2a 895 newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
56d1115c
BF
896
897 if (xfs_inobt_issparse(~allocmask)) {
898 /*
899 * We've allocated a sparse chunk. Align the startino and mask.
900 */
901 xfs_align_sparse_ino(args.mp, &newino, &allocmask);
902
903 rec.ir_startino = newino;
904 rec.ir_holemask = ~allocmask;
905 rec.ir_count = newlen;
906 rec.ir_freecount = newlen;
907 rec.ir_free = XFS_INOBT_ALL_FREE;
908
909 /*
910 * Insert the sparse record into the inobt and allow for a merge
911 * if necessary. If a merge does occur, rec is updated to the
912 * merged record.
913 */
8541a7d9 914 error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
56d1115c
BF
915 if (error == -EFSCORRUPTED) {
916 xfs_alert(args.mp,
917 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
7b13c515 918 XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
56d1115c
BF
919 rec.ir_startino),
920 rec.ir_holemask, rec.ir_count);
921 xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
922 }
923 if (error)
924 return error;
925
926 /*
927 * We can't merge the part we've just allocated as for the inobt
928 * due to finobt semantics. The original record may or may not
929 * exist independent of whether physical inodes exist in this
930 * sparse chunk.
931 *
932 * We must update the finobt record based on the inobt record.
933 * rec contains the fully merged and up to date inobt record
934 * from the previous call. Set merge false to replace any
935 * existing record with this one.
936 */
ebd9027d 937 if (xfs_has_finobt(args.mp)) {
8541a7d9 938 error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
56d1115c
BF
939 if (error)
940 return error;
941 }
942 } else {
943 /* full chunk - insert new records to both btrees */
fbeef4e0 944 error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
56d1115c
BF
945 if (error)
946 return error;
947
ebd9027d 948 if (xfs_has_finobt(args.mp)) {
dedab3e4 949 error = xfs_inobt_insert(pag, tp, agbp, newino,
fbeef4e0 950 newlen, true);
56d1115c
BF
951 if (error)
952 return error;
953 }
954 }
955
956 /*
957 * Update AGI counts and newino.
958 */
413d57c9
MS
959 be32_add_cpu(&agi->agi_count, newlen);
960 be32_add_cpu(&agi->agi_freecount, newlen);
44b56e0a 961 pag->pagi_freecount += newlen;
89e9b5c0 962 pag->pagi_count += newlen;
16259e7d 963 agi->agi_newino = cpu_to_be32(newino);
85c0b2ab 964
1da177e4
LT
965 /*
966 * Log allocation group header fields
967 */
968 xfs_ialloc_log_agi(tp, agbp,
969 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
970 /*
971 * Modify/log superblock values for inode count and inode free count.
972 */
973 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
974 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
1da177e4
LT
975 return 0;
976}
977
4254b0bb
CH
978/*
979 * Try to retrieve the next record to the left/right from the current one.
980 */
981STATIC int
982xfs_ialloc_next_rec(
983 struct xfs_btree_cur *cur,
984 xfs_inobt_rec_incore_t *rec,
985 int *done,
986 int left)
987{
988 int error;
989 int i;
990
991 if (left)
992 error = xfs_btree_decrement(cur, 0, &i);
993 else
994 error = xfs_btree_increment(cur, 0, &i);
995
996 if (error)
997 return error;
998 *done = !i;
999 if (i) {
1000 error = xfs_inobt_get_rec(cur, rec, &i);
1001 if (error)
1002 return error;
989d5ec3
DW
1003 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1004 xfs_btree_mark_sick(cur);
f9e03706 1005 return -EFSCORRUPTED;
989d5ec3 1006 }
4254b0bb
CH
1007 }
1008
1009 return 0;
1010}
1011
bd169565
DC
/*
 * Look up the inode chunk record starting exactly at @agino and return it in
 * @rec.  *done is set if no record with that exact startino exists.
 */
STATIC int
xfs_ialloc_get_rec(
	struct xfs_btree_cur	*cur,
	xfs_agino_t		agino,
	xfs_inobt_rec_incore_t	*rec,
	int			*done)
{
	int			error;
	int			i;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	*done = !i;
	if (i) {
		error = xfs_inobt_get_rec(cur, rec, &i);
		if (error)
			return error;
		/* lookup said the record exists; failing to read it back is corruption */
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}
	}

	return 0;
}
0b48db80 1038
d4cc540b 1039/*
26dd5217
BF
1040 * Return the offset of the first free inode in the record. If the inode chunk
1041 * is sparsely allocated, we convert the record holemask to inode granularity
1042 * and mask off the unallocated regions from the inode free mask.
d4cc540b
BF
1043 */
1044STATIC int
1045xfs_inobt_first_free_inode(
1046 struct xfs_inobt_rec_incore *rec)
1047{
26dd5217
BF
1048 xfs_inofree_t realfree;
1049
1050 /* if there are no holes, return the first available offset */
1051 if (!xfs_inobt_issparse(rec->ir_holemask))
1052 return xfs_lowbit64(rec->ir_free);
1053
1054 realfree = xfs_inobt_irec_to_allocmask(rec);
1055 realfree &= rec->ir_free;
1056
1057 return xfs_lowbit64(realfree);
d4cc540b
BF
1058}
1059
2935213a
DW
/*
 * If this AG has corrupt inodes, check if allocating this inode would fail
 * with corruption errors.  Returns 0 if we're clear, or EAGAIN to try again
 * somewhere else.
 */
static int
xfs_dialloc_check_ino(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino)
{
	struct xfs_imap		imap;
	struct xfs_buf		*bp;
	int			error;

	/* Map the inode number to its cluster buffer location. */
	error = xfs_imap(pag, tp, ino, &imap, 0);
	if (error)
		return -EAGAIN;

	/* Reading the cluster buffer runs its verifiers; failure means
	 * this chunk is unusable, so steer the caller elsewhere. */
	error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
	if (error)
		return -EAGAIN;

	xfs_trans_brelse(tp, bp);
	return 0;
}
1086
/*
 * Allocate an inode using the inobt-only algorithm.
 *
 * If the parent lives in this AG, search outward (left and right) from the
 * parent's chunk for the nearest record with a free inode; otherwise fall
 * back to the agi_newino hint and finally a full AG scan.  The last search
 * position is cached in the perag so a subsequent allocation for the same
 * parent can resume where this one stopped.
 */
STATIC int
xfs_dialloc_ag_inobt(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agi		*agi = agbp->b_addr;
	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur	*cur, *tcur;
	struct xfs_inobt_rec_incore rec, trec;
	xfs_ino_t		ino;
	int			error;
	int			offset;
	int			i, j;
	int			searchdistance = 10;	/* bound the outward walk */

	ASSERT(xfs_perag_initialised_agi(pag));
	ASSERT(xfs_perag_allows_inodes(pag));
	ASSERT(pag->pagi_freecount > 0);

 restart_pagno:
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * If in the same AG as the parent, try to get near the parent.
	 */
	if (pagno == pag->pag_agno) {
		int		doneleft;	/* done, to the left */
		int		doneright;	/* done, to the right */

		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_inobt_get_rec(cur, &rec, &j);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, j != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		if (rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as the parent, done.
			 */
			goto alloc_inode;
		}


		/*
		 * In the same AG as parent, but parent's chunk is full.
		 */

		/* duplicate the cursor, search left & right simultaneously */
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;

		/*
		 * Skip to last blocks looked up if same parent inode.
		 */
		if (pagino != NULLAGINO &&
		    pag->pagl_pagino == pagino &&
		    pag->pagl_leftrec != NULLAGINO &&
		    pag->pagl_rightrec != NULLAGINO) {
			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
						   &trec, &doneleft);
			if (error)
				goto error1;

			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
						   &rec, &doneright);
			if (error)
				goto error1;
		} else {
			/* search left with tcur, back up 1 record */
			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
			if (error)
				goto error1;

			/* search right with cur, go forward 1 record. */
			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
			if (error)
				goto error1;
		}

		/*
		 * Loop until we find an inode chunk with a free inode.
		 */
		while (--searchdistance > 0 && (!doneleft || !doneright)) {
			int	useleft;  /* using left inode chunk this time */

			/* figure out the closer block if both are valid. */
			if (!doneleft && !doneright) {
				useleft = pagino -
				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
				  rec.ir_startino - pagino;
			} else {
				useleft = !doneleft;
			}

			/* free inodes to the left? */
			if (useleft && trec.ir_freecount) {
				/* left cursor (tcur) takes over as "cur" */
				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
				cur = tcur;

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				rec = trec;
				goto alloc_inode;
			}

			/* free inodes to the right? */
			if (!useleft && rec.ir_freecount) {
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				goto alloc_inode;
			}

			/* get next record to check */
			if (useleft) {
				error = xfs_ialloc_next_rec(tcur, &trec,
							    &doneleft, 1);
			} else {
				error = xfs_ialloc_next_rec(cur, &rec,
							    &doneright, 0);
			}
			if (error)
				goto error1;
		}

		if (searchdistance <= 0) {
			/*
			 * Not in range - save last search
			 * location and allocate a new inode
			 */
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			pag->pagl_leftrec = trec.ir_startino;
			pag->pagl_rightrec = rec.ir_startino;
			pag->pagl_pagino = pagino;

		} else {
			/*
			 * We've reached the end of the btree.  Because we only
			 * search a small chunk of the btree on each pass,
			 * there are obviously free inodes closer to the parent
			 * inode than we are now.  Restart the search.
			 */
			pag->pagl_pagino = NULLAGINO;
			pag->pagl_leftrec = NULLAGINO;
			pag->pagl_rightrec = NULLAGINO;
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
			goto restart_pagno;
		}
	}

	/*
	 * In a different AG from the parent.
	 * See if the most recently allocated block has any free.
	 */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			goto error0;

		if (i == 1) {
			error = xfs_inobt_get_rec(cur, &rec, &j);
			if (error)
				goto error0;

			if (j == 1 && rec.ir_freecount > 0) {
				/*
				 * The last chunk allocated in the group
				 * still has a free inode.
				 */
				goto alloc_inode;
			}
		}
	}

	/*
	 * None left in the last group, search the whole AG
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}

	for (;;) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if (rec.ir_freecount > 0)
			break;
		/*
		 * pagi_freecount > 0 (asserted above), so running off the end
		 * of the btree without finding a free inode is corruption.
		 */
		error = xfs_btree_increment(cur, 0, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
	}

alloc_inode:
	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error0;
	}

	/* Mark the inode allocated in the record and all the counters. */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	error = xfs_inobt_update(cur, &rec);
	if (error)
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
1369
6dd8638e
BF
/*
 * Use the free inode btree to allocate an inode based on distance from the
 * parent.  Note that the provided cursor may be deleted and replaced: on
 * return *ocur points at whichever cursor (left or right search) holds the
 * chosen record.
 */
STATIC int
xfs_dialloc_ag_finobt_near(
	xfs_agino_t			pagino,
	struct xfs_btree_cur		**ocur,
	struct xfs_inobt_rec_incore	*rec)
{
	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
	struct xfs_btree_cur		*rcur;	/* right search cursor */
	struct xfs_inobt_rec_incore	rrec;
	int				error;
	int				i, j;

	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
	if (error)
		return error;

	if (i == 1) {
		error = xfs_inobt_get_rec(lcur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(lcur);
			return -EFSCORRUPTED;
		}

		/*
		 * See if we've landed in the parent inode record. The finobt
		 * only tracks chunks with at least one free inode, so record
		 * existence is enough.
		 */
		if (pagino >= rec->ir_startino &&
		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
			return 0;
	}

	error = xfs_btree_dup_cursor(lcur, &rcur);
	if (error)
		return error;

	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
	if (error)
		goto error_rcur;
	if (j == 1) {
		error = xfs_inobt_get_rec(rcur, &rrec, &j);
		if (error)
			goto error_rcur;
		if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
			xfs_btree_mark_sick(lcur);
			error = -EFSCORRUPTED;
			goto error_rcur;
		}
	}

	/* the finobt has free inodes, so at least one lookup must succeed */
	if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
		xfs_btree_mark_sick(lcur);
		error = -EFSCORRUPTED;
		goto error_rcur;
	}
	if (i == 1 && j == 1) {
		/*
		 * Both the left and right records are valid. Choose the closer
		 * inode chunk to the target.
		 */
		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
		    (rrec.ir_startino - pagino)) {
			*rec = rrec;
			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
			*ocur = rcur;
		} else {
			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
		}
	} else if (j == 1) {
		/* only the right record is valid */
		*rec = rrec;
		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
		*ocur = rcur;
	} else if (i == 1) {
		/* only the left record is valid */
		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
	}

	return 0;

error_rcur:
	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
	return error;
}
1461
/*
 * Use the free inode btree to find a free inode based on a newino hint. If
 * the hint is NULL, find the first free inode in the AG.
 */
STATIC int
xfs_dialloc_ag_finobt_newino(
	struct xfs_agi			*agi,
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*rec)
{
	int error;
	int i;

	/* Try the last-allocated chunk first, if the hint is set. */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			return error;
		if (i == 1) {
			error = xfs_inobt_get_rec(cur, rec, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				return -EFSCORRUPTED;
			}
			return 0;
		}
	}

	/*
	 * Find the first inode available in the AG.
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;
	/* caller guaranteed free inodes, so an empty finobt is corruption */
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return 0;
}
1513
/*
 * Update the inobt based on a modification made to the finobt. Also ensure
 * that the records from both trees are equivalent post-modification.
 */
STATIC int
xfs_dialloc_ag_update_inobt(
	struct xfs_btree_cur		*cur,	/* inobt cursor */
	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
	int				offset) /* inode offset */
{
	struct xfs_inobt_rec_incore	rec;
	int				error;
	int				i;

	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);

	/* apply the same allocation the caller just made in the finobt */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;

	/* inobt and finobt must now agree on the chunk's free state */
	if (XFS_IS_CORRUPT(cur->bc_mp,
			   rec.ir_free != frec->ir_free ||
			   rec.ir_freecount != frec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_inobt_update(cur, &rec);
}
1558
/*
 * Allocate an inode using the free inode btree, if available. Otherwise, fall
 * back to the inobt search algorithm.
 *
 * The caller selected an AG for us, and made sure that free inodes are
 * available.
 */
static int
xfs_dialloc_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_agi			*agi = agbp->b_addr;
	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur		*cur;	/* finobt cursor */
	struct xfs_btree_cur		*icur;	/* inobt cursor */
	struct xfs_inobt_rec_incore	rec;
	xfs_ino_t			ino;
	int				error;
	int				offset;
	int				i;

	if (!xfs_has_finobt(mp))
		return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);

	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_cur;

	/*
	 * The search algorithm depends on whether we're in the same AG as the
	 * parent. If so, find the closest available inode to the parent. If
	 * not, consider the agi hint or find the first free inode in the AG.
	 */
	if (pag->pag_agno == pagno)
		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
	else
		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
	if (error)
		goto error_cur;

	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
	       XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error_cur;
	}

	/*
	 * Modify or remove the finobt record.
	 */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	if (rec.ir_freecount)
		error = xfs_inobt_update(cur, &rec);
	else
		error = xfs_btree_delete(cur, &i);
	if (error)
		goto error_cur;

	/*
	 * The finobt has now been updated appropriately. We haven't updated the
	 * agi and superblock yet, so we can create an inobt cursor and validate
	 * the original freecount. If all is well, make the equivalent update to
	 * the inobt using the finobt record and offset information.
	 */
	icur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;

	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
	if (error)
		goto error_icur;

	/*
	 * Both trees have now been updated. We must update the perag and
	 * superblock before we can check the freecount for each btree.
	 */
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_icur;

	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	*inop = ino;
	return 0;

error_icur:
	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
1683
/*
 * Roll the allocation transaction so the just-allocated inode chunk becomes
 * visible, while keeping the AGI buffer locked across the commit so no other
 * allocator can steal the new inodes before the caller picks one.
 */
static int
xfs_dialloc_roll(
	struct xfs_trans	**tpp,
	struct xfs_buf		*agibp)
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_dquot_acct	*dqinfo;
	int			error;

	/*
	 * Hold on to the agibp across the commit so no other allocation can
	 * come in and take the free inodes we just allocated for our caller.
	 */
	xfs_trans_bhold(tp, agibp);

	/*
	 * We want the quota changes to be associated with the next transaction,
	 * NOT this one. So, detach the dqinfo from this and attach it to the
	 * next transaction.
	 */
	dqinfo = tp->t_dqinfo;
	tp->t_dqinfo = NULL;

	error = xfs_trans_roll(&tp);

	/* Re-attach the quota info that we detached from prev trx. */
	tp->t_dqinfo = dqinfo;

	/*
	 * Join the buffer even on commit error so that the buffer is released
	 * when the caller cancels the transaction and doesn't have to handle
	 * this error case specially.
	 */
	xfs_trans_bjoin(tp, agibp);
	*tpp = tp;
	return error;
}
1721
8237fbf5
DC
/*
 * Decide whether this AG is a good candidate for inode allocation: it must
 * permit inode allocation and either already have free inodes or, when
 * @ok_alloc is set, have enough suitably-aligned free space to allocate a
 * new inode chunk.
 */
static bool
xfs_dialloc_good_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	umode_t			mode,
	int			flags,
	bool			ok_alloc)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_extlen_t		ineed;
	xfs_extlen_t		longest = 0;
	int			needspace;
	int			error;

	if (!pag)
		return false;
	if (!xfs_perag_allows_inodes(pag))
		return false;

	/* demand-load the AGI so pagi_freecount is valid */
	if (!xfs_perag_initialised_agi(pag)) {
		error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
		if (error)
			return false;
	}

	if (pag->pagi_freecount)
		return true;
	if (!ok_alloc)
		return false;

	/* we may allocate a chunk; need AGF space data for the checks below */
	if (!xfs_perag_initialised_agf(pag)) {
		error = xfs_alloc_read_agf(pag, tp, flags, NULL);
		if (error)
			return false;
	}

	/*
	 * Check that there is enough free space for the file plus a chunk of
	 * inodes if we need to allocate some. If this is the first pass across
	 * the AGs, take into account the potential space needed for alignment
	 * of inode chunks when checking the longest contiguous free space in
	 * the AG - this prevents us from getting ENOSPC because we have free
	 * space larger than ialloc_blks but alignment constraints prevent us
	 * from using it.
	 *
	 * If we can't find an AG with space for full alignment slack to be
	 * taken into account, we must be near ENOSPC in all AGs. Hence we
	 * don't include alignment for the second pass and so if we fail
	 * allocation due to alignment issues then it is most likely a real
	 * ENOSPC condition.
	 *
	 * XXX(dgc): this calculation is now bogus thanks to the per-ag
	 * reservations that xfs_alloc_fix_freelist() now does via
	 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
	 * be more than large enough for the check below to succeed, but
	 * xfs_alloc_space_available() will fail because of the non-zero
	 * metadata reservation and hence we won't actually be able to allocate
	 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
	 * because of this.
	 */
	ineed = M_IGEO(mp)->ialloc_min_blks;
	if (flags && ineed > 1)
		ineed += M_IGEO(mp)->cluster_align;
	longest = pag->pagf_longest;
	if (!longest)
		longest = pag->pagf_flcount > 0;
	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);

	if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
		return false;
	return true;
}
1794
/*
 * Attempt to allocate an inode from this AG, allocating a fresh inode chunk
 * first if the AG has none free and @ok_alloc permits it.  Returns -EAGAIN
 * to tell the caller to move on to the next AG.
 */
static int
xfs_dialloc_try_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	**tpp,
	xfs_ino_t		parent,
	xfs_ino_t		*new_ino,
	bool			ok_alloc)
{
	struct xfs_buf		*agbp;
	xfs_ino_t		ino;
	int			error;

	/*
	 * Then read in the AGI buffer and recheck with the AGI buffer
	 * lock held.
	 */
	error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
	if (error)
		return error;

	if (!pag->pagi_freecount) {
		if (!ok_alloc) {
			error = -EAGAIN;
			goto out_release;
		}

		error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
		if (error < 0)
			goto out_release;

		/*
		 * We successfully allocated space for an inode cluster in this
		 * AG. Roll the transaction so that we can allocate one of the
		 * new inodes.
		 */
		ASSERT(pag->pagi_freecount > 0);
		error = xfs_dialloc_roll(tpp, agbp);
		if (error)
			goto out_release;
	}

	/* Allocate an inode in the found AG */
	error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
	if (!error)
		*new_ino = ino;
	return error;

out_release:
	xfs_trans_brelse(*tpp, agbp);
	return error;
}
1846
/*
 * Allocate an on-disk inode.
 *
 * Mode is used to tell whether the new inode is a directory and hence where to
 * locate it. The on-disk inode that is allocated will be returned in @new_ino
 * on success, otherwise an error will be set to indicate the failure (e.g.
 * -ENOSPC).
 */
int
xfs_dialloc(
	struct xfs_trans	**tpp,
	const struct xfs_icreate_args *args,
	xfs_ino_t		*new_ino)
{
	struct xfs_mount	*mp = (*tpp)->t_mountp;
	xfs_ino_t		parent = args->pip ? args->pip->i_ino : 0;
	umode_t			mode = args->mode & S_IFMT;
	xfs_agnumber_t		agno;
	int			error = 0;
	xfs_agnumber_t		start_agno;
	struct xfs_perag	*pag;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	bool			ok_alloc = true;
	bool			low_space = false;
	int			flags;
	xfs_ino_t		ino = NULLFSINO;

	/*
	 * Directories are rotored across AGs to spread them out; other inode
	 * types start in the parent's AG so related files stay close together.
	 */
	if (S_ISDIR(mode))
		start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
				mp->m_maxagi;
	else {
		start_agno = XFS_INO_TO_AGNO(mp, parent);
		if (start_agno >= mp->m_maxagi)
			start_agno = 0;
	}

	/*
	 * If we have already hit the ceiling of inode blocks then clear
	 * ok_alloc so we scan all available agi structures for a free
	 * inode.
	 *
	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
	 * which will sacrifice the preciseness but improve the performance.
	 */
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
							> igeo->maxicount) {
		ok_alloc = false;
	}

	/*
	 * If we are near to ENOSPC, we want to prefer allocation from AGs that
	 * have free inodes in them rather than use up free space allocating new
	 * inode chunks. Hence we turn off allocation for the first non-blocking
	 * pass through the AGs if we are near ENOSPC to consume free inodes
	 * that we can immediately allocate, but then we allow allocation on the
	 * second pass if we fail to find an AG with free inodes in it.
	 */
	if (percpu_counter_read_positive(&mp->m_fdblocks) <
			mp->m_low_space[XFS_LOWSP_1_PCNT]) {
		ok_alloc = false;
		low_space = true;
	}

	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes.  Iterate through the
	 * allocation groups upward, wrapping at the end.
	 */
	flags = XFS_ALLOC_FLAG_TRYLOCK;
retry:
	for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
		if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
			error = xfs_dialloc_try_ag(pag, tpp, parent,
					&ino, ok_alloc);
			if (error != -EAGAIN)
				break;
			/* -EAGAIN means "try the next AG", not failure */
			error = 0;
		}

		if (xfs_is_shutdown(mp)) {
			error = -EFSCORRUPTED;
			break;
		}
	}
	if (pag)
		xfs_perag_rele(pag);
	if (error)
		return error;
	if (ino == NULLFSINO) {
		if (flags) {
			/* second pass: blocking AGF locks, allow chunk
			 * allocation if we held off due to low space */
			flags = 0;
			if (low_space)
				ok_alloc = true;
			goto retry;
		}
		return -ENOSPC;
	}

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	*new_ino = ino;
	return 0;
}
1968
10ae3dc7
BF
/*
 * Free the blocks of an inode chunk. We must consider that the inode chunk
 * might be sparse and only free the regions that are allocated as part of the
 * chunk.
 *
 * Returns 0 or a negative error code from xfs_free_extent_later(); the
 * actual extent free is deferred to transaction commit.
 */
static int
xfs_difree_inode_chunk(
	struct xfs_trans		*tp,
	xfs_agnumber_t			agno,
	struct xfs_inobt_rec_incore	*rec)
{
	struct xfs_mount		*mp = tp->t_mountp;
	xfs_agblock_t			sagbno = XFS_AGINO_TO_AGBNO(mp,
							rec->ir_startino);
	int				startidx, endidx;
	int				nextbit;
	xfs_agblock_t			agbno;
	int				contigblk;
	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);

	if (!xfs_inobt_issparse(rec->ir_holemask)) {
		/* not sparse, calculate extent info directly */
		return xfs_free_extent_later(tp,
				XFS_AGB_TO_FSB(mp, agno, sagbno),
				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
				XFS_AG_RESV_NONE, 0);
	}

	/* holemask is only 16-bits (fits in an unsigned long) */
	ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
	holemask[0] = rec->ir_holemask;

	/*
	 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
	 * holemask and convert the start/end index of each range to an extent.
	 * We start with the start and end index both pointing at the first 0 in
	 * the mask.
	 */
	startidx = endidx = find_first_zero_bit(holemask,
			XFS_INOBT_HOLEMASK_BITS);
	nextbit = startidx + 1;
	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
		int error;

		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
				nextbit);
		/*
		 * If the next zero bit is contiguous, update the end index of
		 * the current range and continue.
		 */
		if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
		    nextbit == endidx + 1) {
			endidx = nextbit;
			goto next;
		}

		/*
		 * nextbit is not contiguous with the current end index. Convert
		 * the current start/end to an extent and add it to the free
		 * list.
		 */
		agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
				  mp->m_sb.sb_inopblock;
		contigblk = ((endidx - startidx + 1) *
			     XFS_INODES_PER_HOLEMASK_BIT) /
			    mp->m_sb.sb_inopblock;

		/* sparse regions must be aligned to the sparse chunk unit */
		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
		error = xfs_free_extent_later(tp,
				XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
				&XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
		if (error)
			return error;

		/* reset range to current bit and carry on... */
		startidx = endidx = nextbit;

next:
		nextbit++;
	}
	return 0;
}
2052
2b64ee5c
BF
/*
 * Free an inode in the inode btree: mark @agino free in its chunk record.
 * If the whole chunk becomes free (and chunk removal is allowed), delete
 * the record, adjust the AGI/superblock counters and schedule the chunk's
 * disk space for freeing; @xic tells the caller whether that happened.
 * The updated record is returned in @orec so the finobt can be kept in
 * sync by the caller.
 */
STATIC int
xfs_difree_inobt(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agino_t			agino,
	struct xfs_icluster		*xic,
	struct xfs_inobt_rec_incore	*orec)
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_agi			*agi = agbp->b_addr;
	struct xfs_btree_cur		*cur;
	struct xfs_inobt_rec_incore	rec;
	int				ilen;
	int				error;
	int				i;
	int				off;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));

	/*
	 * Initialize the cursor.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * Look for the entry describing this inode.
	 */
	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
			__func__, error);
		goto error0;
	}
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error) {
		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
			__func__, error);
		goto error0;
	}
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Get the offset in the inode chunk.
	 */
	off = agino - rec.ir_startino;
	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
	/* the inode must currently be allocated in the record */
	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
	/*
	 * Mark the inode free & increment the count.
	 */
	rec.ir_free |= XFS_INOBT_MASK(off);
	rec.ir_freecount++;

	/*
	 * When an inode chunk is free, it becomes eligible for removal. Don't
	 * remove the chunk if the block size is large enough for multiple inode
	 * chunks (that might not be free).
	 */
	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
		xic->deleted = true;
		xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
				rec.ir_startino);
		xic->alloc = xfs_inobt_irec_to_allocmask(&rec);

		/*
		 * Remove the inode cluster from the AGI B+Tree, adjust the
		 * AGI and Superblock inode counts, and mark the disk space
		 * to be freed when the transaction is committed.
		 */
		ilen = rec.ir_freecount;
		be32_add_cpu(&agi->agi_count, -ilen);
		/*
		 * ilen - 1 because the inode being freed was already counted
		 * as free before this call.
		 */
		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
		pag->pagi_freecount -= ilen - 1;
		pag->pagi_count -= ilen;
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));

		if ((error = xfs_btree_delete(cur, &i))) {
			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
				__func__, error);
			goto error0;
		}

		error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
		if (error)
			goto error0;
	} else {
		xic->deleted = false;

		error = xfs_inobt_update(cur, &rec);
		if (error) {
			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
				__func__, error);
			goto error0;
		}

		/*
		 * Change the inode free counts and log the ag/sb changes.
		 */
		be32_add_cpu(&agi->agi_freecount, 1);
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
		pag->pagi_freecount++;
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
	}

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	*orec = rec;
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;

error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
2185
3efa4ffd
BF
/*
 * Free an inode in the free inode btree.
 *
 * @ibtrec is the already-updated inobt record for the inode's chunk; the
 * finobt copy is modified independently and cross-checked against it so
 * that divergence between the two trees is caught as corruption. Only
 * called when the filesystem has a finobt (see xfs_difree()).
 */
STATIC int
xfs_difree_finobt(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agino_t			agino,
	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_btree_cur		*cur;
	struct xfs_inobt_rec_incore	rec;
	int				offset = agino - ibtrec->ir_startino;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	if (i == 0) {
		/*
		 * If the record does not exist in the finobt, we must have just
		 * freed an inode in a previously fully allocated chunk. If not,
		 * something is out of sync.
		 */
		if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
					     ibtrec->ir_count,
					     ibtrec->ir_freecount,
					     ibtrec->ir_free, &i);
		if (error)
			goto error;
		ASSERT(i == 1);

		goto out;
	}

	/*
	 * Read and update the existing record. We could just copy the ibtrec
	 * across here, but that would defeat the purpose of having redundant
	 * metadata. By making the modifications independently, we can catch
	 * corruptions that we wouldn't see if we just copied from one record
	 * to another.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	rec.ir_free |= XFS_INOBT_MASK(offset);
	rec.ir_freecount++;

	/* the independently-updated finobt record must match the inobt one */
	if (XFS_IS_CORRUPT(mp,
			   rec.ir_free != ibtrec->ir_free ||
			   rec.ir_freecount != ibtrec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * The content of inobt records should always match between the inobt
	 * and finobt. The lifecycle of records in the finobt is different from
	 * the inobt in that the finobt only tracks records with at least one
	 * free inode. Hence, if all of the inodes are free and we aren't
	 * keeping inode chunks permanently on disk, remove the record.
	 * Otherwise, update the record with the new information.
	 *
	 * Note that we currently can't free chunks when the block size is large
	 * enough for multiple chunks. Leave the finobt record to remain in sync
	 * with the inobt.
	 */
	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
		error = xfs_btree_delete(cur, &i);
		if (error)
			goto error;
		ASSERT(i == 1);
	} else {
		error = xfs_inobt_update(cur, &rec);
		if (error)
			goto error;
	}

out:
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;

error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
2295
2b64ee5c
BF
/*
 * Free disk inode. Carefully avoids touching the incore inode, all
 * manipulations incore are the caller's responsibility.
 * The on-disk inode is not changed by this operation, only the
 * btree (free inode mask) is changed.
 *
 * On success, @xic reports whether the whole inode cluster was freed so
 * the caller can invalidate any associated buffers. Returns -EINVAL for
 * inode numbers that fail basic sanity checks against @pag and the
 * superblock geometry.
 */
int
xfs_difree(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_ino_t		inode,
	struct xfs_icluster	*xic)
{
	/* REFERENCED */
	xfs_agblock_t		agbno;	/* block number containing inode */
	struct xfs_buf		*agbp;	/* buffer for allocation group header */
	xfs_agino_t		agino;	/* allocation group inode number */
	int			error;	/* error return value */
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inobt_rec_incore rec;/* btree record */

	/*
	 * Break up inode number into its components.
	 */
	if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
		xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
			__func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
		ASSERT(0);
		return -EINVAL;
	}
	agino = XFS_INO_TO_AGINO(mp, inode);
	if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
			__func__, (unsigned long long)inode,
			(unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
		ASSERT(0);
		return -EINVAL;
	}
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= mp->m_sb.sb_agblocks) {
		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
			__func__, agbno, mp->m_sb.sb_agblocks);
		ASSERT(0);
		return -EINVAL;
	}
	/*
	 * Get the allocation group header.
	 */
	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
			__func__, error);
		return error;
	}

	/*
	 * Fix up the inode allocation btree.
	 */
	error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
	if (error)
		goto error0;

	/*
	 * Fix up the free inode btree.
	 */
	if (xfs_has_finobt(mp)) {
		error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
		if (error)
			goto error0;
	}

	return 0;

error0:
	return error;
}
2372
7124fe0a
DC
/*
 * Look up the inobt record covering @agino and return the AG block of the
 * start of its inode chunk (*chunk_agbno) and the inode's block offset
 * within that chunk (*offset_agbno). Returns -EINVAL if no record covers
 * the inode, or — for XFS_IGET_UNTRUSTED lookups — if the inode is not
 * currently allocated in the record.
 */
STATIC int
xfs_imap_lookup(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agino_t		agino,
	xfs_agblock_t		agbno,
	xfs_agblock_t		*chunk_agbno,
	xfs_agblock_t		*offset_agbno,
	int			flags)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;

	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_alert(mp,
			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
			__func__, error, pag->pag_agno);
		return error;
	}

	/*
	 * Lookup the inode record for the given agino. If the record cannot be
	 * found, then it's an invalid inode number and we should abort. Once
	 * we have a record, we need to ensure it contains the inode number
	 * we are looking up.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error) {
		if (i)
			error = xfs_inobt_get_rec(cur, &rec, &i);
		if (!error && i == 0)
			error = -EINVAL;
	}

	/* release the AGI buffer and cursor before inspecting the record */
	xfs_trans_brelse(tp, agbp);
	xfs_btree_del_cursor(cur, error);
	if (error)
		return error;

	/* check that the returned record contains the required inode */
	if (rec.ir_startino > agino ||
	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
		return -EINVAL;

	/* for untrusted inodes check it is allocated first */
	if ((flags & XFS_IGET_UNTRUSTED) &&
	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
		return -EINVAL;

	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
	*offset_agbno = agbno - *chunk_agbno;
	return 0;
}
2432
/*
 * Return the location of the inode in imap, for mapping it into a buffer.
 *
 * Fills in imap->im_blkno (daddr of the inode cluster buffer), im_len
 * (buffer length in BBs) and im_boffset (byte offset of the inode within
 * the buffer). Untrusted inode numbers (XFS_IGET_UNTRUSTED in @flags) are
 * validated against the inode btree before mapping.
 */
int
xfs_imap(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,	/* inode to locate */
	struct xfs_imap		*imap,	/* location map structure */
	uint			flags)	/* flags for inode btree lookup */
{
	struct xfs_mount	*mp = pag->pag_mount;
	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
	xfs_agino_t		agino;	/* inode number within alloc group */
	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
	int			error;	/* error code */
	int			offset;	/* index of inode in its buffer */
	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */

	ASSERT(ino != NULLFSINO);

	/*
	 * Split up the inode number into its parts.
	 */
	agino = XFS_INO_TO_AGINO(mp, ino);
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= mp->m_sb.sb_agblocks ||
	    ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
		error = -EINVAL;
#ifdef DEBUG
		/*
		 * Don't output diagnostic information for untrusted inodes
		 * as they can be invalid without implying corruption.
		 */
		if (flags & XFS_IGET_UNTRUSTED)
			return error;
		if (agbno >= mp->m_sb.sb_agblocks) {
			xfs_alert(mp,
		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
				__func__, (unsigned long long)agbno,
				(unsigned long)mp->m_sb.sb_agblocks);
		}
		if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
			xfs_alert(mp,
		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
				__func__, ino,
				XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
		}
		xfs_stack_trace();
#endif /* DEBUG */
		return error;
	}

	/*
	 * For bulkstat and handle lookups, we have an untrusted inode number
	 * that we have to verify is valid. We cannot do this just by reading
	 * the inode buffer as it may have been unlinked and removed leaving
	 * inodes in stale state on disk. Hence we have to do a btree lookup
	 * in all cases where an untrusted inode number is passed.
	 */
	if (flags & XFS_IGET_UNTRUSTED) {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
		goto out_map;
	}

	/*
	 * If the inode cluster size is the same as the blocksize or
	 * smaller we get to the buffer by simple arithmetics.
	 */
	if (M_IGEO(mp)->blocks_per_cluster == 1) {
		offset = XFS_INO_TO_OFFSET(mp, ino);
		ASSERT(offset < mp->m_sb.sb_inopblock);

		imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
		imap->im_len = XFS_FSB_TO_BB(mp, 1);
		imap->im_boffset = (unsigned short)(offset <<
							mp->m_sb.sb_inodelog);
		return 0;
	}

	/*
	 * If the inode chunks are aligned then use simple maths to
	 * find the location. Otherwise we have to do a btree
	 * lookup to find the location.
	 */
	if (M_IGEO(mp)->inoalign_mask) {
		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
		chunk_agbno = agbno - offset_agbno;
	} else {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
	}

out_map:
	ASSERT(agbno >= chunk_agbno);
	/* round down to the start of the inode cluster containing agbno */
	cluster_agbno = chunk_agbno +
		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
		 M_IGEO(mp)->blocks_per_cluster);
	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
		XFS_INO_TO_OFFSET(mp, ino);

	imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);

	/*
	 * If the inode number maps to a block outside the bounds
	 * of the file system then return NULL rather than calling
	 * read_buf and panicing when we get an error from the
	 * driver.
	 */
	if ((imap->im_blkno + imap->im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		xfs_alert(mp,
	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
			__func__, (unsigned long long) imap->im_blkno,
			(unsigned long long) imap->im_len,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
		return -EINVAL;
	}
	return 0;
}
2561
/*
 * Log specified fields for the ag hdr (inode section). The growth of the agi
 * structure over time requires that we interpret the buffer as two logical
 * regions delineated by the end of the unlinked list. This is due to the size
 * of the hash table and its location in the middle of the agi.
 *
 * For example, a request to log a field before agi_unlinked and a field after
 * agi_unlinked could cause us to log the entire hash table and use an excessive
 * amount of log space. To avoid this behavior, log the region up through
 * agi_unlinked in one call and the region after agi_unlinked through the end of
 * the structure in another.
 */
void
xfs_ialloc_log_agi(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int			first;		/* first byte number */
	int			last;		/* last byte number */
	static const short	offsets[] = {	/* field starting offsets */
					/* keep in sync with bit definitions */
		offsetof(xfs_agi_t, agi_magicnum),
		offsetof(xfs_agi_t, agi_versionnum),
		offsetof(xfs_agi_t, agi_seqno),
		offsetof(xfs_agi_t, agi_length),
		offsetof(xfs_agi_t, agi_count),
		offsetof(xfs_agi_t, agi_root),
		offsetof(xfs_agi_t, agi_level),
		offsetof(xfs_agi_t, agi_freecount),
		offsetof(xfs_agi_t, agi_newino),
		offsetof(xfs_agi_t, agi_dirino),
		offsetof(xfs_agi_t, agi_unlinked),
		offsetof(xfs_agi_t, agi_free_root),
		offsetof(xfs_agi_t, agi_free_level),
		offsetof(xfs_agi_t, agi_iblocks),
		sizeof(xfs_agi_t)
	};
#ifdef DEBUG
	struct xfs_agi		*agi = bp->b_addr;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif

	/*
	 * Compute byte offsets for the first and last fields in the first
	 * region and log the agi buffer. This only logs up through
	 * agi_unlinked.
	 */
	if (fields & XFS_AGI_ALL_BITS_R1) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}

	/*
	 * Mask off the bits in the first region and calculate the first and
	 * last field offsets for any bits in the second region.
	 */
	fields &= ~XFS_AGI_ALL_BITS_R1;
	if (fields) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}
}
2628
/*
 * Structure verifier for AGI buffers: returns the address of the first
 * failing check (for verifier error reporting), or NULL if the AGI
 * passes all sanity checks.
 */
static xfs_failaddr_t
xfs_agi_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
	int			i;

	/* v5 (CRC-enabled) filesystems also carry a UUID and LSN to check */
	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
			return __this_address;
	}

	/*
	 * Validate the magic number of the agi block.
	 */
	if (!xfs_verify_magic(bp, agi->agi_magicnum))
		return __this_address;
	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
		return __this_address;

	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
	if (fa)
		return fa;

	/* inobt (and finobt, if present) heights must be in [1, max] */
	if (be32_to_cpu(agi->agi_level) < 1 ||
	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	if (xfs_has_finobt(mp) &&
	    (be32_to_cpu(agi->agi_free_level) < 1 ||
	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
		return __this_address;

	/* non-empty unlinked buckets must point at valid inode numbers */
	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
			continue;
		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
			return __this_address;
	}

	return NULL;
}
2677
1813dd64
DC
/*
 * Read verifier for AGI buffers: check the CRC first (v5 filesystems),
 * then the structure. Errors are reported through xfs_verifier_error()
 * rather than a return value.
 */
static void
xfs_agi_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agi_verify(bp);
		/* XFS_TEST_ERROR allows error injection of read failures */
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
2694
/*
 * Write verifier for AGI buffers: validate the structure, then (on v5
 * filesystems) stamp the buffer's LSN and recompute the CRC before it
 * goes to disk.
 */
static void
xfs_agi_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agi_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	/* non-CRC filesystems carry no checksum or LSN to update */
	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
2717
/* Buffer ops wiring the AGI verifiers into the buffer cache. */
const struct xfs_buf_ops xfs_agi_buf_ops = {
	.name = "xfs_agi",
	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
	.verify_read = xfs_agi_read_verify,
	.verify_write = xfs_agi_write_verify,
	.verify_struct = xfs_agi_verify,
};
2725
1da177e4
LT
/*
 * Read in the allocation group header (inode allocation section).
 *
 * On success the (locked) AGI buffer is returned in *agibpp. A sick-AG
 * health mark is recorded if the read fails with a metadata corruption
 * error.
 */
int
xfs_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	int			error;

	trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
	if (error)
		return error;
	if (tp)
		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);

	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
	return 0;
}
2754
a95fee40
DC
/*
 * Read in the agi and initialise the per-ag data. If the caller supplies a
 * @agibpp, return the locked AGI buffer to them, otherwise release it.
 *
 * @flags may contain XFS_IALLOC_FLAG_TRYLOCK to make the underlying
 * buffer read non-blocking.
 */
int
xfs_ialloc_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	int			error;

	trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);

	error = xfs_read_agi(pag, tp,
			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agibp);
	if (error)
		return error;

	/* first read of this AGI seeds the in-memory per-ag counters */
	agi = agibp->b_addr;
	if (!xfs_perag_initialised_agi(pag)) {
		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
		pag->pagi_count = be32_to_cpu(agi->agi_count);
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag->pag_mount));
	if (agibpp)
		*agibpp = agibp;
	else
		xfs_trans_brelse(tp, agibp);
	return 0;
}
91fb9afc 2797
efc0845f
DW
/*
 * How many inodes are backed by inode clusters ondisk?
 *
 * Counts, in *allocated, the inodes in the agino range [low, high] whose
 * holemask bits indicate backing disk space (holemask bit clear).
 */
STATIC int
xfs_ialloc_count_ondisk(
	struct xfs_btree_cur	*cur,
	xfs_agino_t		low,
	xfs_agino_t		high,
	unsigned int		*allocated)
{
	struct xfs_inobt_rec_incore irec;
	unsigned int		ret = 0;
	int			has_record;
	int			error;

	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
	if (error)
		return error;

	while (has_record) {
		unsigned int		i, hole_idx;

		error = xfs_inobt_get_rec(cur, &irec, &has_record);
		if (error)
			return error;
		if (irec.ir_startino > high)
			break;

		/* count inodes in this chunk that fall inside [low, high] */
		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
			if (irec.ir_startino + i < low)
				continue;
			if (irec.ir_startino + i > high)
				break;

			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
			if (!(irec.ir_holemask & (1U << hole_idx)))
				ret++;
		}

		error = xfs_btree_increment(cur, 0, &has_record);
		if (error)
			return error;
	}

	*allocated = ret;
	return 0;
}
2843
/*
 * Is there an inode record covering a given extent?
 *
 * Classifies the extent [bno, bno + len) as fully, partially or not at
 * all backed by ondisk inodes and reports the result in *outcome.
 */
int
xfs_ialloc_has_inodes_at_extent(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	xfs_agino_t		agino;
	xfs_agino_t		last_agino;
	unsigned int		allocated;
	int			error;

	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;

	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
	if (error)
		return error;

	if (allocated == 0)
		*outcome = XBTREE_RECPACKING_EMPTY;
	else if (allocated == last_agino - agino + 1)
		*outcome = XBTREE_RECPACKING_FULL;
	else
		*outcome = XBTREE_RECPACKING_SPARSE;
	return 0;
}
2872
/* Accumulator for xfs_ialloc_count_inodes(). */
struct xfs_ialloc_count_inodes {
	xfs_agino_t			count;		/* total inodes seen */
	xfs_agino_t			freecount;	/* free inodes seen */
};

/*
 * Record inode counts across all inobt records. Per-record callback for
 * xfs_btree_query_all(); rejects records that fail validation.
 */
STATIC int
xfs_ialloc_count_inodes_rec(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xfs_ialloc_count_inodes	*ci = priv;
	xfs_failaddr_t			fa;

	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
	fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, &irec);

	ci->count += irec.ir_count;
	ci->freecount += irec.ir_freecount;

	return 0;
}
2898}
2899
2900/* Count allocated and free inodes under an inobt. */
2901int
2902xfs_ialloc_count_inodes(
2903 struct xfs_btree_cur *cur,
2904 xfs_agino_t *count,
2905 xfs_agino_t *freecount)
2906{
2907 struct xfs_ialloc_count_inodes ci = {0};
2908 int error;
2909
ec793e69 2910 ASSERT(xfs_btree_is_ino(cur->bc_ops));
2e001266
DW
2911 error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2912 if (error)
2913 return error;
2914
2915 *count = ci.count;
2916 *freecount = ci.freecount;
2917 return 0;
2918}
494dba7b
DW
2919
/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size. This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	/* di_flags2 bits to set on newly allocated inodes. */
	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	/*
	 * Inodes are allocated in chunks of at least XFS_INODES_PER_CHUNK,
	 * or one block's worth of inodes if that is larger.
	 */
	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	/* Sparse inode chunks may be allocated at the sparse alignment. */
	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		igeo->maxicount = 0;
	}

	/*
	 * Compute the desired size of an inode cluster buffer size, which
	 * starts at 8K and (on v5 filesystems) scales up with larger inode
	 * sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment.  The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int	new_size = igeo->inode_cluster_size_raw;

		/* Scale with inode size, but only if mkfs aligned for it. */
		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;
}
13eaec4b
DW
3038
3039/* Compute the location of the root directory inode that is laid out by mkfs. */
3040xfs_ino_t
3041xfs_ialloc_calc_rootino(
3042 struct xfs_mount *mp,
3043 int sunit)
3044{
3045 struct xfs_ino_geometry *igeo = M_IGEO(mp);
3046 xfs_agblock_t first_bno;
3047
3048 /*
3049 * Pre-calculate the geometry of AG 0. We know what it looks like
3050 * because libxfs knows how to create allocation groups now.
3051 *
3052 * first_bno is the first block in which mkfs could possibly have
3053 * allocated the root directory inode, once we factor in the metadata
3054 * that mkfs formats before it. Namely, the four AG headers...
3055 */
3056 first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
3057
3058 /* ...the two free space btree roots... */
3059 first_bno += 2;
3060
3061 /* ...the inode btree root... */
3062 first_bno += 1;
3063
3064 /* ...the initial AGFL... */
3065 first_bno += xfs_alloc_min_freelist(mp, NULL);
3066
3067 /* ...the free inode btree root... */
ebd9027d 3068 if (xfs_has_finobt(mp))
13eaec4b
DW
3069 first_bno++;
3070
3071 /* ...the reverse mapping btree root... */
ebd9027d 3072 if (xfs_has_rmapbt(mp))
13eaec4b
DW
3073 first_bno++;
3074
3075 /* ...the reference count btree... */
ebd9027d 3076 if (xfs_has_reflink(mp))
13eaec4b
DW
3077 first_bno++;
3078
3079 /*
3080 * ...and the log, if it is allocated in the first allocation group.
3081 *
3082 * This can happen with filesystems that only have a single
3083 * allocation group, or very odd geometries created by old mkfs
3084 * versions on very small filesystems.
3085 */
36029dee 3086 if (xfs_ag_contains_log(mp, 0))
13eaec4b
DW
3087 first_bno += mp->m_sb.sb_logblocks;
3088
3089 /*
3090 * Now round first_bno up to whatever allocation alignment is given
3091 * by the filesystem or was passed in.
3092 */
ebd9027d 3093 if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
13eaec4b 3094 first_bno = roundup(first_bno, sunit);
ebd9027d 3095 else if (xfs_has_align(mp) &&
13eaec4b
DW
3096 mp->m_sb.sb_inoalignmt > 1)
3097 first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
3098
3099 return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
3100}
da062d16
DW
3101
3102/*
3103 * Ensure there are not sparse inode clusters that cross the new EOAG.
3104 *
3105 * This is a no-op for non-spinode filesystems since clusters are always fully
3106 * allocated and checking the bnobt suffices. However, a spinode filesystem
3107 * could have a record where the upper inodes are free blocks. If those blocks
3108 * were removed from the filesystem, the inode record would extend beyond EOAG,
3109 * which will be flagged as corruption.
3110 */
3111int
3112xfs_ialloc_check_shrink(
dedab3e4 3113 struct xfs_perag *pag,
da062d16 3114 struct xfs_trans *tp,
da062d16
DW
3115 struct xfs_buf *agibp,
3116 xfs_agblock_t new_length)
3117{
3118 struct xfs_inobt_rec_incore rec;
3119 struct xfs_btree_cur *cur;
bab8b795 3120 xfs_agino_t agino;
da062d16
DW
3121 int has;
3122 int error;
3123
bab8b795 3124 if (!xfs_has_sparseinodes(pag->pag_mount))
da062d16
DW
3125 return 0;
3126
14dd46cf 3127 cur = xfs_inobt_init_cursor(pag, tp, agibp);
da062d16
DW
3128
3129 /* Look up the inobt record that would correspond to the new EOFS. */
bab8b795 3130 agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
da062d16
DW
3131 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
3132 if (error || !has)
3133 goto out;
3134
3135 error = xfs_inobt_get_rec(cur, &rec, &has);
3136 if (error)
3137 goto out;
3138
3139 if (!has) {
baf44fa5 3140 xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
da062d16
DW
3141 error = -EFSCORRUPTED;
3142 goto out;
3143 }
3144
3145 /* If the record covers inodes that would be beyond EOFS, bail out. */
3146 if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
3147 error = -ENOSPC;
3148 goto out;
3149 }
3150out:
3151 xfs_btree_del_cursor(cur, error);
da062d16
DW
3152 return error;
3153}
This page took 2.149706 seconds and 4 git commands to generate.