]> Git Repo - linux.git/blame - fs/xfs/libxfs/xfs_ialloc.c
xfs: make the calculation generic in xfs_sb_validate_fsb_count()
[linux.git] / fs / xfs / libxfs / xfs_ialloc.c
CommitLineData
0b61f8a4 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
7b718769
NS
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
1da177e4 5 */
1da177e4 6#include "xfs.h"
a844f451 7#include "xfs_fs.h"
70a9883c 8#include "xfs_shared.h"
239880ef
DC
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
a844f451 12#include "xfs_bit.h"
1da177e4 13#include "xfs_mount.h"
1da177e4 14#include "xfs_inode.h"
a844f451
NS
15#include "xfs_btree.h"
16#include "xfs_ialloc.h"
a4fbe6ab 17#include "xfs_ialloc_btree.h"
1da177e4 18#include "xfs_alloc.h"
e9e899a2 19#include "xfs_errortag.h"
1da177e4
LT
20#include "xfs_error.h"
21#include "xfs_bmap.h"
239880ef 22#include "xfs_trans.h"
983d09ff 23#include "xfs_buf_item.h"
ddf6ad01 24#include "xfs_icreate_item.h"
7bb85ef3 25#include "xfs_icache.h"
d123031a 26#include "xfs_trace.h"
a45086e2 27#include "xfs_log.h"
340785cc 28#include "xfs_rmap.h"
9bbafc71 29#include "xfs_ag.h"
de6077ec 30#include "xfs_health.h"
1da177e4 31
fe033cc8 32/*
21875505 33 * Lookup a record by ino in the btree given by cur.
fe033cc8 34 */
81e25176 35int /* error */
21875505 36xfs_inobt_lookup(
fe033cc8
CH
37 struct xfs_btree_cur *cur, /* btree cursor */
38 xfs_agino_t ino, /* starting inode of chunk */
21875505 39 xfs_lookup_t dir, /* <=, >=, == */
fe033cc8
CH
40 int *stat) /* success/failure */
41{
42 cur->bc_rec.i.ir_startino = ino;
5419040f
BF
43 cur->bc_rec.i.ir_holemask = 0;
44 cur->bc_rec.i.ir_count = 0;
21875505
CH
45 cur->bc_rec.i.ir_freecount = 0;
46 cur->bc_rec.i.ir_free = 0;
47 return xfs_btree_lookup(cur, dir, stat);
fe033cc8
CH
48}
49
278d0ca1 50/*
afabc24a 51 * Update the record referred to by cur to the value given.
278d0ca1
CH
52 * This either works (return 0) or gets an EFSCORRUPTED error.
53 */
54STATIC int /* error */
55xfs_inobt_update(
56 struct xfs_btree_cur *cur, /* btree cursor */
afabc24a 57 xfs_inobt_rec_incore_t *irec) /* btree record */
278d0ca1
CH
58{
59 union xfs_btree_rec rec;
60
afabc24a 61 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
38c26bfd 62 if (xfs_has_sparseinodes(cur->bc_mp)) {
5419040f
BF
63 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
64 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
65 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
66 } else {
67 /* ir_holemask/ir_count not supported on-disk */
68 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
69 }
afabc24a 70 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
278d0ca1
CH
71 return xfs_btree_update(cur, &rec);
72}
73
e936945e
DW
74/* Convert on-disk btree record to incore inobt record. */
75void
76xfs_inobt_btrec_to_irec(
77 struct xfs_mount *mp,
159eb69d 78 const union xfs_btree_rec *rec,
e936945e 79 struct xfs_inobt_rec_incore *irec)
8cc938fe 80{
5419040f 81 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
38c26bfd 82 if (xfs_has_sparseinodes(mp)) {
5419040f
BF
83 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
84 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
85 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
86 } else {
87 /*
88 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
89 * values for full inode chunks.
90 */
91 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
92 irec->ir_count = XFS_INODES_PER_CHUNK;
93 irec->ir_freecount =
94 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
8cc938fe 95 }
5419040f 96 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
e936945e
DW
97}
98
dbfbf3bd
DW
99/* Compute the freecount of an incore inode record. */
100uint8_t
101xfs_inobt_rec_freecount(
102 const struct xfs_inobt_rec_incore *irec)
103{
104 uint64_t realfree = irec->ir_free;
105
106 if (xfs_inobt_issparse(irec->ir_holemask))
107 realfree &= xfs_inobt_irec_to_allocmask(irec);
108 return hweight64(realfree);
109}
110
/*
 * Simple checks for inode records.
 *
 * Returns NULL if @irec is internally consistent and properly placed within
 * @pag, or the address of the failing check (__this_address) so the caller
 * can report where validation failed.
 */
xfs_failaddr_t
xfs_inobt_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_inobt_rec_incore	*irec)
{
	/* Record has to be properly aligned within the AG. */
	if (!xfs_verify_agino(pag, irec->ir_startino))
		return __this_address;
	if (!xfs_verify_agino(pag,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
		return __this_address;
	/* Count must lie between one holemask granule and a full chunk. */
	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
	    irec->ir_count > XFS_INODES_PER_CHUNK)
		return __this_address;
	if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
		return __this_address;

	/* The stored freecount must agree with the free/hole masks. */
	if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
		return __this_address;

	return NULL;
}
134
/*
 * Complain about a corrupt inode btree record found at @fa, mark the btree
 * sick in the health tracking state, and return -EFSCORRUPTED for the caller
 * to propagate.
 */
static inline int
xfs_inobt_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_inobt_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
	/* Dump the record contents to aid post-mortem analysis. */
	xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
		irec->ir_startino, irec->ir_count, irec->ir_freecount,
		irec->ir_free, irec->ir_holemask);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
153
e936945e
DW
154/*
155 * Get the data from the pointed-to record.
156 */
157int
158xfs_inobt_get_rec(
159 struct xfs_btree_cur *cur,
160 struct xfs_inobt_rec_incore *irec,
161 int *stat)
162{
9e6c08d4 163 struct xfs_mount *mp = cur->bc_mp;
e936945e 164 union xfs_btree_rec *rec;
366a0b8d 165 xfs_failaddr_t fa;
e936945e
DW
166 int error;
167
168 error = xfs_btree_get_rec(cur, &rec, stat);
169 if (error || *stat == 0)
170 return error;
171
9e6c08d4 172 xfs_inobt_btrec_to_irec(mp, rec, irec);
dbfbf3bd 173 fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
366a0b8d 174 if (fa)
ee12eaaa 175 return xfs_inobt_complain_bad_rec(cur, fa, irec);
5419040f
BF
176
177 return 0;
8cc938fe
CH
178}
179
0aa0a756
BF
180/*
181 * Insert a single inobt record. Cursor must already point to desired location.
182 */
7f8f1313 183int
0aa0a756
BF
184xfs_inobt_insert_rec(
185 struct xfs_btree_cur *cur,
c8ce540d
DW
186 uint16_t holemask,
187 uint8_t count,
188 int32_t freecount,
0aa0a756
BF
189 xfs_inofree_t free,
190 int *stat)
191{
5419040f
BF
192 cur->bc_rec.i.ir_holemask = holemask;
193 cur->bc_rec.i.ir_count = count;
0aa0a756
BF
194 cur->bc_rec.i.ir_freecount = freecount;
195 cur->bc_rec.i.ir_free = free;
196 return xfs_btree_insert(cur, stat);
197}
198
199/*
200 * Insert records describing a newly allocated inode chunk into the inobt.
201 */
202STATIC int
203xfs_inobt_insert(
dedab3e4 204 struct xfs_perag *pag,
0aa0a756
BF
205 struct xfs_trans *tp,
206 struct xfs_buf *agbp,
207 xfs_agino_t newino,
208 xfs_agino_t newlen,
fbeef4e0 209 bool is_finobt)
0aa0a756
BF
210{
211 struct xfs_btree_cur *cur;
0aa0a756
BF
212 xfs_agino_t thisino;
213 int i;
214 int error;
215
fbeef4e0 216 if (is_finobt)
14dd46cf
CH
217 cur = xfs_finobt_init_cursor(pag, tp, agbp);
218 else
219 cur = xfs_inobt_init_cursor(pag, tp, agbp);
0aa0a756
BF
220
221 for (thisino = newino;
222 thisino < newino + newlen;
223 thisino += XFS_INODES_PER_CHUNK) {
224 error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
225 if (error) {
226 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
227 return error;
228 }
229 ASSERT(i == 0);
230
5419040f
BF
231 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
232 XFS_INODES_PER_CHUNK,
233 XFS_INODES_PER_CHUNK,
0aa0a756
BF
234 XFS_INOBT_ALL_FREE, &i);
235 if (error) {
236 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
237 return error;
238 }
239 ASSERT(i == 1);
240 }
241
242 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
243
244 return 0;
245}
246
/*
 * Verify that the number of free inodes in the AGI is correct.
 *
 * Debug-only sanity check: walks every inobt record and compares the summed
 * freecount against the cached perag value.  Only single-level (root-leaf)
 * trees are walked, which bounds the cost of the check.  Compiles away to a
 * constant 0 on non-DEBUG builds.
 */
#ifdef DEBUG
static int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_nlevels == 1) {
		xfs_inobt_rec_incore_t rec;
		int		freecount = 0;
		int		error;
		int		i;

		/* Start at the lowest-numbered record in the tree. */
		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
		if (error)
			return error;

		/* Accumulate freecounts until the walk runs off the end. */
		do {
			error = xfs_inobt_get_rec(cur, &rec, &i);
			if (error)
				return error;

			if (i) {
				freecount += rec.ir_freecount;
				error = xfs_btree_increment(cur, 0, &i);
				if (error)
					return error;
			}
		} while (i == 1);

		/* Counters may legitimately be stale after a shutdown. */
		if (!xfs_is_shutdown(cur->bc_mp))
			ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
	}
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif
286
/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 *
 * @agno/@agbno/@length describe the newly allocated chunk; @icount is the
 * number of inodes to record in the icreate log item; @gen seeds di_gen for
 * every inode in the chunk.  Returns 0 or a negative error.
 */
int
xfs_ialloc_inode_init(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct list_head	*buffer_list,
	int			icount,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_agblock_t		length,
	unsigned int		gen)
{
	struct xfs_buf		*fbuf;
	struct xfs_dinode	*free;
	int			nbufs;
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;
	int			error;

	/*
	 * Loop over the new block(s), filling in the inodes. For small block
	 * sizes, manipulate the inodes in buffers which are multiples of the
	 * blocks size.
	 */
	nbufs = length / M_IGEO(mp)->blocks_per_cluster;

	/*
	 * Figure out what version number to use in the inodes we create. If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version. Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have to log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inode we log the entire buffer rather than just the
	 * inode cores.
	 */
	if (xfs_has_v3inodes(mp)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));

		/*
		 * log the initialisation that is about to take place as an
		 * logical operation. This means the transaction does not
		 * need to log the physical changes to the inode buffers as log
		 * recovery will know what initialisation is actually needed.
		 * Hence we only need to log the buffers as "ordered" buffers so
		 * they track in the AIL as if they were physically logged.
		 */
		if (tp)
			xfs_icreate_log(tp, agno, agbno, icount,
					mp->m_sb.sb_inodesize, length, gen);
	} else
		version = 2;

	for (j = 0; j < nbufs; j++) {
		/*
		 * Get the block.
		 */
		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
				(j * M_IGEO(mp)->blocks_per_cluster));
		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
				XBF_UNMAPPED, &fbuf);
		if (error)
			return error;

		/* Initialize the inode buffers and log them appropriately. */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);

			if (version == 3) {
				/* v3: stamp inode number + uuid, then CRC. */
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid,
					  &mp->m_sb.sb_meta_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else if (tp) {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
					  ioffset + XFS_DINODE_SIZE(mp) - 1);
			}
		}

		if (tp) {
			/*
			 * Mark the buffer as an inode allocation buffer so it
			 * sticks in AIL at the point of this allocation
			 * transaction. This ensures they are on disk before
			 * the tail of the log can be moved past this
			 * transaction (i.e. by preventing relogging from moving
			 * it forward in the log).
			 */
			xfs_trans_inode_alloc_buf(tp, fbuf);
			if (version == 3) {
				/*
				 * Mark the buffer as ordered so that they are
				 * not physically logged in the transaction but
				 * still tracked in the AIL as part of the
				 * transaction and pin the log appropriately.
				 */
				xfs_trans_ordered_buf(tp, fbuf);
			}
		} else {
			/* No transaction: queue for delayed write instead. */
			fbuf->b_flags |= XBF_DONE;
			xfs_buf_delwri_queue(fbuf, buffer_list);
			xfs_buf_relse(fbuf);
		}
	}
	return 0;
}
421
56d1115c
BF
422/*
423 * Align startino and allocmask for a recently allocated sparse chunk such that
424 * they are fit for insertion (or merge) into the on-disk inode btrees.
425 *
426 * Background:
427 *
428 * When enabled, sparse inode support increases the inode alignment from cluster
429 * size to inode chunk size. This means that the minimum range between two
430 * non-adjacent inode records in the inobt is large enough for a full inode
431 * record. This allows for cluster sized, cluster aligned block allocation
432 * without need to worry about whether the resulting inode record overlaps with
433 * another record in the tree. Without this basic rule, we would have to deal
434 * with the consequences of overlap by potentially undoing recent allocations in
435 * the inode allocation codepath.
436 *
437 * Because of this alignment rule (which is enforced on mount), there are two
438 * inobt possibilities for newly allocated sparse chunks. One is that the
439 * aligned inode record for the chunk covers a range of inodes not already
440 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
441 * other is that a record already exists at the aligned startino that considers
442 * the newly allocated range as sparse. In the latter case, record content is
443 * merged in hope that sparse inode chunks fill to full chunks over time.
444 */
445STATIC void
446xfs_align_sparse_ino(
447 struct xfs_mount *mp,
448 xfs_agino_t *startino,
449 uint16_t *allocmask)
450{
451 xfs_agblock_t agbno;
452 xfs_agblock_t mod;
453 int offset;
454
455 agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
456 mod = agbno % mp->m_sb.sb_inoalignmt;
457 if (!mod)
458 return;
459
460 /* calculate the inode offset and align startino */
43004b2a 461 offset = XFS_AGB_TO_AGINO(mp, mod);
56d1115c
BF
462 *startino -= offset;
463
464 /*
465 * Since startino has been aligned down, left shift allocmask such that
466 * it continues to represent the same physical inodes relative to the
467 * new startino.
468 */
469 *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
470}
471
472/*
473 * Determine whether the source inode record can merge into the target. Both
474 * records must be sparse, the inode ranges must match and there must be no
475 * allocation overlap between the records.
476 */
477STATIC bool
478__xfs_inobt_can_merge(
479 struct xfs_inobt_rec_incore *trec, /* tgt record */
480 struct xfs_inobt_rec_incore *srec) /* src record */
481{
482 uint64_t talloc;
483 uint64_t salloc;
484
485 /* records must cover the same inode range */
486 if (trec->ir_startino != srec->ir_startino)
487 return false;
488
489 /* both records must be sparse */
490 if (!xfs_inobt_issparse(trec->ir_holemask) ||
491 !xfs_inobt_issparse(srec->ir_holemask))
492 return false;
493
494 /* both records must track some inodes */
495 if (!trec->ir_count || !srec->ir_count)
496 return false;
497
498 /* can't exceed capacity of a full record */
499 if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
500 return false;
501
502 /* verify there is no allocation overlap */
503 talloc = xfs_inobt_irec_to_allocmask(trec);
504 salloc = xfs_inobt_irec_to_allocmask(srec);
505 if (talloc & salloc)
506 return false;
507
508 return true;
509}
510
511/*
512 * Merge the source inode record into the target. The caller must call
513 * __xfs_inobt_can_merge() to ensure the merge is valid.
514 */
515STATIC void
516__xfs_inobt_rec_merge(
517 struct xfs_inobt_rec_incore *trec, /* target */
518 struct xfs_inobt_rec_incore *srec) /* src */
519{
520 ASSERT(trec->ir_startino == srec->ir_startino);
521
522 /* combine the counts */
523 trec->ir_count += srec->ir_count;
524 trec->ir_freecount += srec->ir_freecount;
525
526 /*
527 * Merge the holemask and free mask. For both fields, 0 bits refer to
528 * allocated inodes. We combine the allocated ranges with bitwise AND.
529 */
530 trec->ir_holemask &= srec->ir_holemask;
531 trec->ir_free &= srec->ir_free;
532}
533
/*
 * Insert a new sparse inode chunk into the associated inode allocation btree.
 * The inode record for the sparse chunk is pre-aligned to a startino that
 * should match any pre-existing sparse inode record in the tree. This allows
 * sparse chunks to fill over time.
 *
 * If no preexisting record exists, the provided record is inserted.
 * If there is a preexisting record, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 */
STATIC int
xfs_inobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;
	struct xfs_inobt_rec_incore	rec;

	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		/* insert must have succeeded or the tree is corrupt */
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	/*
	 * A record exists at this startino.  Merge the records.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}
	/* the found record must share our pre-aligned startino */
	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * This should never fail. If we have coexisting records that
	 * cannot merge, something is seriously wrong.
	 */
	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
				 rec.ir_holemask, nrec->ir_startino,
				 nrec->ir_holemask);

	/* merge to nrec to output the updated record */
	__xfs_inobt_rec_merge(nrec, &rec);

	trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
				  nrec->ir_holemask);

	/* sanity-check the merged counts before writing back */
	error = xfs_inobt_rec_check_count(mp, nrec);
	if (error)
		goto error;

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
634
/*
 * Insert a new sparse inode chunk into the free inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * The new record is always inserted, overwriting a pre-existing record if
 * there is one.
 */
STATIC int
xfs_finobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		/* insert must have succeeded or the tree is corrupt */
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}
	} else {
		/* record already present - overwrite it with @nrec */
		error = xfs_inobt_update(cur, nrec);
		if (error)
			goto error;
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
686
687
/*
 * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
 * inode count threshold, or the usual negative error code for other errors.
 *
 * Allocation strategies are attempted in order: exact-bno next to the last
 * allocated chunk, stripe-aligned near-bno, cluster-aligned near-bno, and
 * finally (when supported) a smaller sparse chunk allocation.
 */
STATIC int
xfs_ialloc_ag_alloc(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agi		*agi;
	struct xfs_alloc_arg	args;
	int			error;
	xfs_agino_t		newino;		/* new first inode's number */
	xfs_agino_t		newlen;		/* new number of inodes */
	int			isaligned = 0;	/* inode allocation at stripe */
						/* unit boundary */
	struct xfs_inobt_rec_incore rec;
	struct xfs_ino_geometry	*igeo = M_IGEO(tp->t_mountp);
	uint16_t		allocmask = (uint16_t) -1; /* init. to full chunk */
	int			do_sparse = 0;

	memset(&args, 0, sizeof(args));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.fsbno = NULLFSBLOCK;
	args.oinfo = XFS_RMAP_OINFO_INODES;
	args.pag = pag;

#ifdef DEBUG
	/* randomly do sparse inode allocations */
	if (xfs_has_sparseinodes(tp->t_mountp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks)
		do_sparse = get_random_u32_below(2);
#endif

	/*
	 * Locking will ensure that we don't have two callers in here
	 * at one time.
	 */
	newlen = igeo->ialloc_inos;
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&args.mp->m_icount) + newlen >
							igeo->maxicount)
		return -ENOSPC;
	args.minlen = args.maxlen = igeo->ialloc_blks;
	/*
	 * First try to allocate inodes contiguous with the last-allocated
	 * chunk of inodes.  If the filesystem is striped, this will fill
	 * an entire stripe unit with inodes.
	 */
	agi = agbp->b_addr;
	newino = be32_to_cpu(agi->agi_newino);
	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
		     igeo->ialloc_blks;
	if (do_sparse)
		goto sparse_alloc;
	if (likely(newino != NULLAGINO &&
		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
		args.prod = 1;

		/*
		 * We need to take into account alignment here to ensure that
		 * we don't modify the free list if we fail to have an exact
		 * block. If we don't have an exact match, and every other
		 * allocation attempt fails, we'll end up cancelling
		 * a dirty transaction and shutting down.
		 *
		 * For an exact allocation, alignment must be 1,
		 * however we need to take cluster alignment into account when
		 * fixing up the freelist. Use the minalignslop field to
		 * indicate that extra blocks might be required for alignment,
		 * but not to use them in the actual exact allocation.
		 */
		args.alignment = 1;
		args.minalignslop = igeo->cluster_align - 1;

		/* Allow space for the inode btree to split. */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_exact_bno(&args,
				XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
						args.agbno));
		if (error)
			return error;

		/*
		 * This request might have dirtied the transaction if the AG can
		 * satisfy the request, but the exact block was not available.
		 * If the allocation did fail, subsequent requests will relax
		 * the exact agbno requirement and increase the alignment
		 * instead. It is critical that the total size of the request
		 * (len + alignment + slop) does not increase from this point
		 * on, so reset minalignslop to ensure it is not included in
		 * subsequent requests.
		 */
		args.minalignslop = 0;
	}

	if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (igeo->ialloc_align) {
			ASSERT(!xfs_has_noalign(args.mp));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else
			args.alignment = igeo->cluster_align;
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_near_bno(&args,
				XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
						be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * If stripe alignment is turned on, then try again with cluster
	 * alignment.
	 */
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		args.alignment = igeo->cluster_align;
		error = xfs_alloc_vextent_near_bno(&args,
				XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
						be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * Finally, try a sparse allocation if the filesystem supports it and
	 * the sparse allocation length is smaller than a full chunk.
	 */
	if (xfs_has_sparseinodes(args.mp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks &&
	    args.fsbno == NULLFSBLOCK) {
sparse_alloc:
		args.alignment = args.mp->m_sb.sb_spino_align;
		args.prod = 1;

		args.minlen = igeo->ialloc_min_blks;
		args.maxlen = args.minlen;

		/*
		 * The inode record will be aligned to full chunk size. We must
		 * prevent sparse allocation from AG boundaries that result in
		 * invalid inode records, such as records that start at agbno 0
		 * or extend beyond the AG.
		 *
		 * Set min agbno to the first aligned, non-zero agbno and max to
		 * the last aligned agbno that is at least one full chunk from
		 * the end of the AG.
		 */
		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
					    args.mp->m_sb.sb_inoalignmt) -
				 igeo->ialloc_blks;

		error = xfs_alloc_vextent_near_bno(&args,
				XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
						be32_to_cpu(agi->agi_root)));
		if (error)
			return error;

		newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
		ASSERT(newlen <= XFS_INODES_PER_CHUNK);
		/* one allocmask bit per holemask granule actually allocated */
		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
	}

	if (args.fsbno == NULLFSBLOCK)
		return -EAGAIN;

	ASSERT(args.len == args.minlen);

	/*
	 * Stamp and write the inode buffers.
	 *
	 * Seed the new inode cluster with a random generation number. This
	 * prevents short-term reuse of generation numbers if a chunk is
	 * freed and then immediately reallocated. We use random numbers
	 * rather than a linear progression to prevent the next generation
	 * number from being easily guessable.
	 */
	error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
			args.agbno, args.len, get_random_u32());

	if (error)
		return error;
	/*
	 * Convert the results.
	 */
	newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);

	if (xfs_inobt_issparse(~allocmask)) {
		/*
		 * We've allocated a sparse chunk. Align the startino and mask.
		 */
		xfs_align_sparse_ino(args.mp, &newino, &allocmask);

		rec.ir_startino = newino;
		rec.ir_holemask = ~allocmask;
		rec.ir_count = newlen;
		rec.ir_freecount = newlen;
		rec.ir_free = XFS_INOBT_ALL_FREE;

		/*
		 * Insert the sparse record into the inobt and allow for a merge
		 * if necessary. If a merge does occur, rec is updated to the
		 * merged record.
		 */
		error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
		if (error == -EFSCORRUPTED) {
			xfs_alert(args.mp,
	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
				  XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
						   rec.ir_startino),
				  rec.ir_holemask, rec.ir_count);
			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
		}
		if (error)
			return error;

		/*
		 * We can't merge the part we've just allocated as for the inobt
		 * due to finobt semantics. The original record may or may not
		 * exist independent of whether physical inodes exist in this
		 * sparse chunk.
		 *
		 * We must update the finobt record based on the inobt record.
		 * rec contains the fully merged and up to date inobt record
		 * from the previous call. Set merge false to replace any
		 * existing record with this one.
		 */
		if (xfs_has_finobt(args.mp)) {
			error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
			if (error)
				return error;
		}
	} else {
		/* full chunk - insert new records to both btrees */
		error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
		if (error)
			return error;

		if (xfs_has_finobt(args.mp)) {
			error = xfs_inobt_insert(pag, tp, agbp, newino,
						 newlen, true);
			if (error)
				return error;
		}
	}

	/*
	 * Update AGI counts and newino.
	 */
	be32_add_cpu(&agi->agi_count, newlen);
	be32_add_cpu(&agi->agi_freecount, newlen);
	pag->pagi_freecount += newlen;
	pag->pagi_count += newlen;
	agi->agi_newino = cpu_to_be32(newino);

	/*
	 * Log allocation group header fields
	 */
	xfs_ialloc_log_agi(tp, agbp,
			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
	/*
	 * Modify/log superblock values for inode count and inode free count.
	 */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
	return 0;
}
977
4254b0bb
CH
978/*
979 * Try to retrieve the next record to the left/right from the current one.
980 */
981STATIC int
982xfs_ialloc_next_rec(
983 struct xfs_btree_cur *cur,
984 xfs_inobt_rec_incore_t *rec,
985 int *done,
986 int left)
987{
988 int error;
989 int i;
990
991 if (left)
992 error = xfs_btree_decrement(cur, 0, &i);
993 else
994 error = xfs_btree_increment(cur, 0, &i);
995
996 if (error)
997 return error;
998 *done = !i;
999 if (i) {
1000 error = xfs_inobt_get_rec(cur, rec, &i);
1001 if (error)
1002 return error;
989d5ec3
DW
1003 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1004 xfs_btree_mark_sick(cur);
f9e03706 1005 return -EFSCORRUPTED;
989d5ec3 1006 }
4254b0bb
CH
1007 }
1008
1009 return 0;
1010}
1011
bd169565
DC
1012STATIC int
1013xfs_ialloc_get_rec(
1014 struct xfs_btree_cur *cur,
1015 xfs_agino_t agino,
1016 xfs_inobt_rec_incore_t *rec,
43df2ee6 1017 int *done)
bd169565
DC
1018{
1019 int error;
1020 int i;
1021
1022 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
1023 if (error)
1024 return error;
1025 *done = !i;
1026 if (i) {
1027 error = xfs_inobt_get_rec(cur, rec, &i);
1028 if (error)
1029 return error;
989d5ec3
DW
1030 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
1031 xfs_btree_mark_sick(cur);
f9e03706 1032 return -EFSCORRUPTED;
989d5ec3 1033 }
bd169565
DC
1034 }
1035
1036 return 0;
1037}
0b48db80 1038
d4cc540b 1039/*
26dd5217
BF
1040 * Return the offset of the first free inode in the record. If the inode chunk
1041 * is sparsely allocated, we convert the record holemask to inode granularity
1042 * and mask off the unallocated regions from the inode free mask.
d4cc540b
BF
1043 */
1044STATIC int
1045xfs_inobt_first_free_inode(
1046 struct xfs_inobt_rec_incore *rec)
1047{
26dd5217
BF
1048 xfs_inofree_t realfree;
1049
1050 /* if there are no holes, return the first available offset */
1051 if (!xfs_inobt_issparse(rec->ir_holemask))
1052 return xfs_lowbit64(rec->ir_free);
1053
1054 realfree = xfs_inobt_irec_to_allocmask(rec);
1055 realfree &= rec->ir_free;
1056
1057 return xfs_lowbit64(realfree);
d4cc540b
BF
1058}
1059
2935213a
DW
1060/*
1061 * If this AG has corrupt inodes, check if allocating this inode would fail
1062 * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
1063 * somewhere else.
1064 */
1065static int
1066xfs_dialloc_check_ino(
1067 struct xfs_perag *pag,
1068 struct xfs_trans *tp,
1069 xfs_ino_t ino)
1070{
1071 struct xfs_imap imap;
1072 struct xfs_buf *bp;
1073 int error;
1074
1075 error = xfs_imap(pag, tp, ino, &imap, 0);
1076 if (error)
1077 return -EAGAIN;
1078
1079 error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
1080 if (error)
1081 return -EAGAIN;
1082
1083 xfs_trans_brelse(tp, bp);
1084 return 0;
1085}
1086
/*
 * Allocate an inode using the inobt-only algorithm.
 *
 * Searches the inode btree of the AG described by @pag/@agbp for a chunk
 * with a free inode, preferring chunks near @parent when the parent lives
 * in this AG.  On success the allocated inode number is returned in @inop
 * and the AGI/superblock free counts are updated in @tp.
 */
STATIC int
xfs_dialloc_ag_inobt(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agi		*agi = agbp->b_addr;
	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur	*cur, *tcur;
	struct xfs_inobt_rec_incore rec, trec;
	xfs_ino_t		ino;
	int			error;
	int			offset;
	int			i, j;
	int			searchdistance = 10;

	ASSERT(xfs_perag_initialised_agi(pag));
	ASSERT(xfs_perag_allows_inodes(pag));
	ASSERT(pag->pagi_freecount > 0);

restart_pagno:
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * If in the same AG as the parent, try to get near the parent.
	 */
	if (pagno == pag->pag_agno) {
		int		doneleft;	/* done, to the left */
		int		doneright;	/* done, to the right */

		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_inobt_get_rec(cur, &rec, &j);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, j != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		if (rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as the parent, done.
			 */
			goto alloc_inode;
		}

		/*
		 * In the same AG as parent, but parent's chunk is full.
		 */

		/* duplicate the cursor, search left & right simultaneously */
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;

		/*
		 * Skip to last blocks looked up if same parent inode.
		 */
		if (pagino != NULLAGINO &&
		    pag->pagl_pagino == pagino &&
		    pag->pagl_leftrec != NULLAGINO &&
		    pag->pagl_rightrec != NULLAGINO) {
			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
						   &trec, &doneleft);
			if (error)
				goto error1;

			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
						   &rec, &doneright);
			if (error)
				goto error1;
		} else {
			/* search left with tcur, back up 1 record */
			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
			if (error)
				goto error1;

			/* search right with cur, go forward 1 record. */
			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
			if (error)
				goto error1;
		}

		/*
		 * Loop until we find an inode chunk with a free inode.
		 */
		while (--searchdistance > 0 && (!doneleft || !doneright)) {
			int	useleft;  /* using left inode chunk this time */

			/* figure out the closer block if both are valid. */
			if (!doneleft && !doneright) {
				useleft = pagino -
				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
				  rec.ir_startino - pagino;
			} else {
				useleft = !doneleft;
			}

			/* free inodes to the left? */
			if (useleft && trec.ir_freecount) {
				/* tcur becomes the live cursor from here on */
				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
				cur = tcur;

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				rec = trec;
				goto alloc_inode;
			}

			/* free inodes to the right? */
			if (!useleft && rec.ir_freecount) {
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				goto alloc_inode;
			}

			/* get next record to check */
			if (useleft) {
				error = xfs_ialloc_next_rec(tcur, &trec,
							    &doneleft, 1);
			} else {
				error = xfs_ialloc_next_rec(cur, &rec,
							    &doneright, 0);
			}
			if (error)
				goto error1;
		}

		if (searchdistance <= 0) {
			/*
			 * Not in range - save last search
			 * location and allocate a new inode
			 */
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			pag->pagl_leftrec = trec.ir_startino;
			pag->pagl_rightrec = rec.ir_startino;
			pag->pagl_pagino = pagino;

		} else {
			/*
			 * We've reached the end of the btree. because
			 * we are only searching a small chunk of the
			 * btree each search, there is obviously free
			 * inodes closer to the parent inode than we
			 * are now. restart the search again.
			 */
			pag->pagl_pagino = NULLAGINO;
			pag->pagl_leftrec = NULLAGINO;
			pag->pagl_rightrec = NULLAGINO;
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
			goto restart_pagno;
		}
	}

	/*
	 * In a different AG from the parent.
	 * See if the most recently allocated block has any free.
	 */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			goto error0;

		if (i == 1) {
			error = xfs_inobt_get_rec(cur, &rec, &j);
			if (error)
				goto error0;

			if (j == 1 && rec.ir_freecount > 0) {
				/*
				 * The last chunk allocated in the group
				 * still has a free inode.
				 */
				goto alloc_inode;
			}
		}
	}

	/*
	 * None left in the last group, search the whole AG
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}

	for (;;) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if (rec.ir_freecount > 0)
			break;
		error = xfs_btree_increment(cur, 0, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
	}

alloc_inode:
	/* rec now describes a chunk with at least one free inode. */
	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error0;
	}

	/* Claim the inode: clear its free bit and update all counters. */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	error = xfs_inobt_update(cur, &rec);
	if (error)
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
1369
/*
 * Use the free inode btree to allocate an inode based on distance from the
 * parent. Note that the provided cursor may be deleted and replaced.
 *
 * On return, @rec holds the chosen finobt record and *ocur is positioned on
 * it (the original cursor is freed if the right-hand cursor was chosen).
 */
STATIC int
xfs_dialloc_ag_finobt_near(
	xfs_agino_t			pagino,
	struct xfs_btree_cur		**ocur,
	struct xfs_inobt_rec_incore	*rec)
{
	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
	struct xfs_btree_cur		*rcur;	/* right search cursor */
	struct xfs_inobt_rec_incore	rrec;
	int				error;
	int				i, j;

	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
	if (error)
		return error;

	if (i == 1) {
		error = xfs_inobt_get_rec(lcur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(lcur);
			return -EFSCORRUPTED;
		}

		/*
		 * See if we've landed in the parent inode record. The finobt
		 * only tracks chunks with at least one free inode, so record
		 * existence is enough.
		 */
		if (pagino >= rec->ir_startino &&
		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
			return 0;
	}

	error = xfs_btree_dup_cursor(lcur, &rcur);
	if (error)
		return error;

	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
	if (error)
		goto error_rcur;
	if (j == 1) {
		error = xfs_inobt_get_rec(rcur, &rrec, &j);
		if (error)
			goto error_rcur;
		if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
			xfs_btree_mark_sick(lcur);
			error = -EFSCORRUPTED;
			goto error_rcur;
		}
	}

	/* At least one of the two lookups must have found a record. */
	if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
		xfs_btree_mark_sick(lcur);
		error = -EFSCORRUPTED;
		goto error_rcur;
	}
	if (i == 1 && j == 1) {
		/*
		 * Both the left and right records are valid. Choose the closer
		 * inode chunk to the target.
		 */
		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
		    (rrec.ir_startino - pagino)) {
			*rec = rrec;
			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
			*ocur = rcur;
		} else {
			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
		}
	} else if (j == 1) {
		/* only the right record is valid */
		*rec = rrec;
		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
		*ocur = rcur;
	} else if (i == 1) {
		/* only the left record is valid */
		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
	}

	return 0;

error_rcur:
	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
	return error;
}
1461
/*
 * Use the free inode btree to find a free inode based on a newino hint. If
 * the hint is NULL, find the first free inode in the AG.
 *
 * Returns the located finobt record in @rec.
 */
STATIC int
xfs_dialloc_ag_finobt_newino(
	struct xfs_agi			*agi,
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*rec)
{
	int error;
	int i;

	/* Try the most recently allocated chunk first, if the hint is set. */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			return error;
		if (i == 1) {
			error = xfs_inobt_get_rec(cur, rec, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				return -EFSCORRUPTED;
			}
			return 0;
		}
	}

	/*
	 * Find the first inode available in the AG.
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return 0;
}
1513
/*
 * Update the inobt based on a modification made to the finobt. Also ensure that
 * the records from both trees are equivalent post-modification.
 *
 * @frec is the already-updated finobt record; @offset is the chunk-relative
 * inode that was just allocated.  Applies the same free-bit clear to the
 * inobt record and cross-checks the two records agree.
 */
STATIC int
xfs_dialloc_ag_update_inobt(
	struct xfs_btree_cur		*cur,	/* inobt cursor */
	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
	int				offset) /* inode offset */
{
	struct xfs_inobt_rec_incore	rec;
	int				error;
	int				i;

	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);

	/* Mirror the allocation performed on the finobt record. */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;

	/* Both trees must now describe the identical free state. */
	if (XFS_IS_CORRUPT(cur->bc_mp,
			   rec.ir_free != frec->ir_free ||
			   rec.ir_freecount != frec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_inobt_update(cur, &rec);
}
1558
/*
 * Allocate an inode using the free inode btree, if available. Otherwise, fall
 * back to the inobt search algorithm.
 *
 * The caller selected an AG for us, and made sure that free inodes are
 * available.  The allocated inode number is returned in @inop.
 */
static int
xfs_dialloc_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_agi			*agi = agbp->b_addr;
	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur		*cur;	/* finobt cursor */
	struct xfs_btree_cur		*icur;	/* inobt cursor */
	struct xfs_inobt_rec_incore	rec;
	xfs_ino_t			ino;
	int				error;
	int				offset;
	int				i;

	if (!xfs_has_finobt(mp))
		return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);

	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_cur;

	/*
	 * The search algorithm depends on whether we're in the same AG as the
	 * parent. If so, find the closest available inode to the parent. If
	 * not, consider the agi hint or find the first free inode in the AG.
	 */
	if (pag->pag_agno == pagno)
		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
	else
		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
	if (error)
		goto error_cur;

	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
	       XFS_INODES_PER_CHUNK) == 0);
	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error_cur;
	}

	/*
	 * Modify or remove the finobt record.
	 */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	if (rec.ir_freecount)
		error = xfs_inobt_update(cur, &rec);
	else
		error = xfs_btree_delete(cur, &i);
	if (error)
		goto error_cur;

	/*
	 * The finobt has now been updated appropriately. We haven't updated the
	 * agi and superblock yet, so we can create an inobt cursor and validate
	 * the original freecount. If all is well, make the equivalent update to
	 * the inobt using the finobt record and offset information.
	 */
	icur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;

	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
	if (error)
		goto error_icur;

	/*
	 * Both trees have now been updated. We must update the perag and
	 * superblock before we can check the freecount for each btree.
	 */
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_icur;

	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	*inop = ino;
	return 0;

error_icur:
	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
1683
/*
 * Roll the transaction while keeping the AGI buffer locked, so the free
 * inodes we just created cannot be claimed by another allocation before
 * our caller gets to use them.  *tpp is replaced with the new transaction.
 */
static int
xfs_dialloc_roll(
	struct xfs_trans	**tpp,
	struct xfs_buf		*agibp)
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_dquot_acct	*dqinfo;
	int			error;

	/*
	 * Hold on to the agibp across the commit so no other allocation can
	 * come in and take the free inodes we just allocated for our caller.
	 */
	xfs_trans_bhold(tp, agibp);

	/*
	 * We want the quota changes to be associated with the next transaction,
	 * NOT this one. So, detach the dqinfo from this and attach it to the
	 * next transaction.
	 */
	dqinfo = tp->t_dqinfo;
	tp->t_dqinfo = NULL;

	error = xfs_trans_roll(&tp);

	/* Re-attach the quota info that we detached from prev trx. */
	tp->t_dqinfo = dqinfo;

	/*
	 * Join the buffer even on commit error so that the buffer is released
	 * when the caller cancels the transaction and doesn't have to handle
	 * this error case specially.
	 */
	xfs_trans_bjoin(tp, agibp);
	*tpp = tp;
	return error;
}
1721
/*
 * Decide whether @pag is a reasonable AG in which to allocate an inode:
 * either it already has free inodes, or (@ok_alloc) it has enough
 * contiguous free space to allocate a new inode chunk.  Read-side errors
 * are treated as "not a good AG" rather than propagated.
 */
static bool
xfs_dialloc_good_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	umode_t			mode,
	int			flags,
	bool			ok_alloc)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_extlen_t		ineed;
	xfs_extlen_t		longest = 0;
	int			needspace;
	int			error;

	if (!pag)
		return false;
	if (!xfs_perag_allows_inodes(pag))
		return false;

	if (!xfs_perag_initialised_agi(pag)) {
		error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
		if (error)
			return false;
	}

	if (pag->pagi_freecount)
		return true;
	if (!ok_alloc)
		return false;

	if (!xfs_perag_initialised_agf(pag)) {
		error = xfs_alloc_read_agf(pag, tp, flags, NULL);
		if (error)
			return false;
	}

	/*
	 * Check that there is enough free space for the file plus a chunk of
	 * inodes if we need to allocate some. If this is the first pass across
	 * the AGs, take into account the potential space needed for alignment
	 * of inode chunks when checking the longest contiguous free space in
	 * the AG - this prevents us from getting ENOSPC because we have free
	 * space larger than ialloc_blks but alignment constraints prevent us
	 * from using it.
	 *
	 * If we can't find an AG with space for full alignment slack to be
	 * taken into account, we must be near ENOSPC in all AGs. Hence we
	 * don't include alignment for the second pass and so if we fail
	 * allocation due to alignment issues then it is most likely a real
	 * ENOSPC condition.
	 *
	 * XXX(dgc): this calculation is now bogus thanks to the per-ag
	 * reservations that xfs_alloc_fix_freelist() now does via
	 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
	 * be more than large enough for the check below to succeed, but
	 * xfs_alloc_space_available() will fail because of the non-zero
	 * metadata reservation and hence we won't actually be able to allocate
	 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
	 * because of this.
	 */
	ineed = M_IGEO(mp)->ialloc_min_blks;
	if (flags && ineed > 1)
		ineed += M_IGEO(mp)->cluster_align;
	longest = pag->pagf_longest;
	if (!longest)
		longest = pag->pagf_flcount > 0;
	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);

	if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
		return false;
	return true;
}
1794
/*
 * Attempt to allocate an inode from @pag, creating a new inode chunk first
 * if the AG has no free inodes and @ok_alloc allows it.  Returns -EAGAIN
 * to tell the caller to try another AG.  May roll *tpp when a new chunk
 * is allocated.
 */
static int
xfs_dialloc_try_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	**tpp,
	xfs_ino_t		parent,
	xfs_ino_t		*new_ino,
	bool			ok_alloc)
{
	struct xfs_buf		*agbp;
	xfs_ino_t		ino;
	int			error;

	/*
	 * Then read in the AGI buffer and recheck with the AGI buffer
	 * lock held.
	 */
	error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
	if (error)
		return error;

	if (!pag->pagi_freecount) {
		if (!ok_alloc) {
			error = -EAGAIN;
			goto out_release;
		}

		error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
		if (error < 0)
			goto out_release;

		/*
		 * We successfully allocated space for an inode cluster in this
		 * AG. Roll the transaction so that we can allocate one of the
		 * new inodes.
		 */
		ASSERT(pag->pagi_freecount > 0);
		error = xfs_dialloc_roll(tpp, agbp);
		if (error)
			goto out_release;
	}

	/* Allocate an inode in the found AG */
	error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
	if (!error)
		*new_ino = ino;
	return error;

out_release:
	xfs_trans_brelse(*tpp, agbp);
	return error;
}
1846
/*
 * Allocate an on-disk inode.
 *
 * Mode is used to tell whether the new inode is a directory and hence where to
 * locate it. The on-disk inode that is allocated will be returned in @new_ino
 * on success, otherwise an error will be set to indicate the failure (e.g.
 * -ENOSPC).
 */
int
xfs_dialloc(
	struct xfs_trans	**tpp,
	xfs_ino_t		parent,
	umode_t			mode,
	xfs_ino_t		*new_ino)
{
	struct xfs_mount	*mp = (*tpp)->t_mountp;
	xfs_agnumber_t		agno;
	int			error = 0;
	xfs_agnumber_t		start_agno;
	struct xfs_perag	*pag;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	bool			ok_alloc = true;
	bool			low_space = false;
	int			flags;
	xfs_ino_t		ino = NULLFSINO;

	/*
	 * Directories, symlinks, and regular files frequently allocate at least
	 * one block, so factor that potential expansion when we examine whether
	 * an AG has enough space for file creation.
	 */
	if (S_ISDIR(mode))
		/* Rotor directories across AGs to spread the namespace. */
		start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
				mp->m_maxagi;
	else {
		start_agno = XFS_INO_TO_AGNO(mp, parent);
		if (start_agno >= mp->m_maxagi)
			start_agno = 0;
	}

	/*
	 * If we have already hit the ceiling of inode blocks then clear
	 * ok_alloc so we scan all available agi structures for a free
	 * inode.
	 *
	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
	 * which will sacrifice the preciseness but improve the performance.
	 */
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
							> igeo->maxicount) {
		ok_alloc = false;
	}

	/*
	 * If we are near to ENOSPC, we want to prefer allocation from AGs that
	 * have free inodes in them rather than use up free space allocating new
	 * inode chunks. Hence we turn off allocation for the first non-blocking
	 * pass through the AGs if we are near ENOSPC to consume free inodes
	 * that we can immediately allocate, but then we allow allocation on the
	 * second pass if we fail to find an AG with free inodes in it.
	 */
	if (percpu_counter_read_positive(&mp->m_fdblocks) <
			mp->m_low_space[XFS_LOWSP_1_PCNT]) {
		ok_alloc = false;
		low_space = true;
	}

	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes. Iterate through the
	 * allocation groups upward, wrapping at the end.
	 */
	flags = XFS_ALLOC_FLAG_TRYLOCK;
retry:
	for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
		if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
			error = xfs_dialloc_try_ag(pag, tpp, parent,
					&ino, ok_alloc);
			if (error != -EAGAIN)
				break;
			/* -EAGAIN: this AG didn't work out, try the next. */
			error = 0;
		}

		if (xfs_is_shutdown(mp)) {
			error = -EFSCORRUPTED;
			break;
		}
	}
	if (pag)
		xfs_perag_rele(pag);
	if (error)
		return error;
	if (ino == NULLFSINO) {
		/* Second pass: blocking AGF locks, and allow chunk
		 * allocation if we were in the low-space regime. */
		if (flags) {
			flags = 0;
			if (low_space)
				ok_alloc = true;
			goto retry;
		}
		return -ENOSPC;
	}

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	*new_ino = ino;
	return 0;
}
1967
10ae3dc7
BF
1968/*
1969 * Free the blocks of an inode chunk. We must consider that the inode chunk
1970 * might be sparse and only free the regions that are allocated as part of the
1971 * chunk.
1972 */
7dfee17b 1973static int
10ae3dc7 1974xfs_difree_inode_chunk(
0f37d178 1975 struct xfs_trans *tp,
10ae3dc7 1976 xfs_agnumber_t agno,
0f37d178 1977 struct xfs_inobt_rec_incore *rec)
10ae3dc7 1978{
0f37d178
BF
1979 struct xfs_mount *mp = tp->t_mountp;
1980 xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp,
1981 rec->ir_startino);
1982 int startidx, endidx;
1983 int nextbit;
1984 xfs_agblock_t agbno;
1985 int contigblk;
10ae3dc7
BF
1986 DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1987
1988 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1989 /* not sparse, calculate extent info directly */
7dfee17b
DC
1990 return xfs_free_extent_later(tp,
1991 XFS_AGB_TO_FSB(mp, agno, sagbno),
b742d7b4 1992 M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
980faece 1993 XFS_AG_RESV_NONE, 0);
10ae3dc7
BF
1994 }
1995
1996 /* holemask is only 16-bits (fits in an unsigned long) */
1997 ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1998 holemask[0] = rec->ir_holemask;
1999
2000 /*
2001 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2002 * holemask and convert the start/end index of each range to an extent.
2003 * We start with the start and end index both pointing at the first 0 in
2004 * the mask.
2005 */
2006 startidx = endidx = find_first_zero_bit(holemask,
2007 XFS_INOBT_HOLEMASK_BITS);
2008 nextbit = startidx + 1;
2009 while (startidx < XFS_INOBT_HOLEMASK_BITS) {
7dfee17b
DC
2010 int error;
2011
10ae3dc7
BF
2012 nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
2013 nextbit);
2014 /*
2015 * If the next zero bit is contiguous, update the end index of
2016 * the current range and continue.
2017 */
2018 if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
2019 nextbit == endidx + 1) {
2020 endidx = nextbit;
2021 goto next;
2022 }
2023
2024 /*
2025 * nextbit is not contiguous with the current end index. Convert
2026 * the current start/end to an extent and add it to the free
2027 * list.
2028 */
2029 agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
2030 mp->m_sb.sb_inopblock;
2031 contigblk = ((endidx - startidx + 1) *
2032 XFS_INODES_PER_HOLEMASK_BIT) /
2033 mp->m_sb.sb_inopblock;
2034
2035 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
2036 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
7dfee17b 2037 error = xfs_free_extent_later(tp,
b742d7b4 2038 XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
980faece 2039 &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
7dfee17b
DC
2040 if (error)
2041 return error;
10ae3dc7
BF
2042
2043 /* reset range to current bit and carry on... */
2044 startidx = endidx = nextbit;
2045
2046next:
2047 nextbit++;
2048 }
7dfee17b 2049 return 0;
10ae3dc7
BF
2050}
2051
2b64ee5c
BF
2052STATIC int
2053xfs_difree_inobt(
dedab3e4 2054 struct xfs_perag *pag,
2b64ee5c
BF
2055 struct xfs_trans *tp,
2056 struct xfs_buf *agbp,
2057 xfs_agino_t agino,
09b56604 2058 struct xfs_icluster *xic,
2b64ee5c 2059 struct xfs_inobt_rec_incore *orec)
1da177e4 2060{
dedab3e4 2061 struct xfs_mount *mp = pag->pag_mount;
370c782b 2062 struct xfs_agi *agi = agbp->b_addr;
2b64ee5c
BF
2063 struct xfs_btree_cur *cur;
2064 struct xfs_inobt_rec_incore rec;
2065 int ilen;
2066 int error;
2067 int i;
2068 int off;
1da177e4 2069
69ef921b 2070 ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2b64ee5c
BF
2071 ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
2072
1da177e4
LT
2073 /*
2074 * Initialize the cursor.
2075 */
14dd46cf 2076 cur = xfs_inobt_init_cursor(pag, tp, agbp);
1da177e4 2077
9ba0889e 2078 error = xfs_check_agi_freecount(cur);
0b48db80
DC
2079 if (error)
2080 goto error0;
2081
1da177e4
LT
2082 /*
2083 * Look for the entry describing this inode.
2084 */
21875505 2085 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
0b932ccc
DC
2086 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
2087 __func__, error);
1da177e4
LT
2088 goto error0;
2089 }
f9e03706 2090 if (XFS_IS_CORRUPT(mp, i != 1)) {
989d5ec3 2091 xfs_btree_mark_sick(cur);
f9e03706
DW
2092 error = -EFSCORRUPTED;
2093 goto error0;
2094 }
2e287a73
CH
2095 error = xfs_inobt_get_rec(cur, &rec, &i);
2096 if (error) {
0b932ccc
DC
2097 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
2098 __func__, error);
1da177e4
LT
2099 goto error0;
2100 }
f9e03706 2101 if (XFS_IS_CORRUPT(mp, i != 1)) {
989d5ec3 2102 xfs_btree_mark_sick(cur);
f9e03706
DW
2103 error = -EFSCORRUPTED;
2104 goto error0;
2105 }
1da177e4
LT
2106 /*
2107 * Get the offset in the inode chunk.
2108 */
2109 off = agino - rec.ir_startino;
2110 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
0d87e656 2111 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1da177e4
LT
2112 /*
2113 * Mark the inode free & increment the count.
2114 */
0d87e656 2115 rec.ir_free |= XFS_INOBT_MASK(off);
1da177e4
LT
2116 rec.ir_freecount++;
2117
2118 /*
999633d3
BF
2119 * When an inode chunk is free, it becomes eligible for removal. Don't
2120 * remove the chunk if the block size is large enough for multiple inode
2121 * chunks (that might not be free).
1da177e4 2122 */
0560f31a 2123 if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
999633d3 2124 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
749f24f3 2125 xic->deleted = true;
7b13c515
DC
2126 xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
2127 rec.ir_startino);
09b56604 2128 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1da177e4
LT
2129
2130 /*
2131 * Remove the inode cluster from the AGI B+Tree, adjust the
2132 * AGI and Superblock inode counts, and mark the disk space
2133 * to be freed when the transaction is committed.
2134 */
999633d3 2135 ilen = rec.ir_freecount;
413d57c9
MS
2136 be32_add_cpu(&agi->agi_count, -ilen);
2137 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1da177e4 2138 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
44b56e0a 2139 pag->pagi_freecount -= ilen - 1;
89e9b5c0 2140 pag->pagi_count -= ilen;
1da177e4
LT
2141 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
2142 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
2143
91cca5df 2144 if ((error = xfs_btree_delete(cur, &i))) {
0b932ccc
DC
2145 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
2146 __func__, error);
1da177e4
LT
2147 goto error0;
2148 }
2149
7dfee17b
DC
2150 error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
2151 if (error)
2152 goto error0;
1da177e4 2153 } else {
749f24f3 2154 xic->deleted = false;
1da177e4 2155
afabc24a
CH
2156 error = xfs_inobt_update(cur, &rec);
2157 if (error) {
0b932ccc
DC
2158 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
2159 __func__, error);
1da177e4
LT
2160 goto error0;
2161 }
afabc24a 2162
b7df7630 2163 /*
1da177e4
LT
2164 * Change the inode free counts and log the ag/sb changes.
2165 */
413d57c9 2166 be32_add_cpu(&agi->agi_freecount, 1);
1da177e4 2167 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
7b13c515 2168 pag->pagi_freecount++;
1da177e4
LT
2169 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2170 }
2171
9ba0889e 2172 error = xfs_check_agi_freecount(cur);
0b48db80
DC
2173 if (error)
2174 goto error0;
1da177e4 2175
2b64ee5c 2176 *orec = rec;
1da177e4
LT
2177 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2178 return 0;
2179
2180error0:
2181 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2182 return error;
2183}
2184
3efa4ffd
BF
2185/*
2186 * Free an inode in the free inode btree.
2187 */
2188STATIC int
2189xfs_difree_finobt(
dedab3e4 2190 struct xfs_perag *pag,
3efa4ffd
BF
2191 struct xfs_trans *tp,
2192 struct xfs_buf *agbp,
2193 xfs_agino_t agino,
2194 struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
2195{
dedab3e4 2196 struct xfs_mount *mp = pag->pag_mount;
3efa4ffd
BF
2197 struct xfs_btree_cur *cur;
2198 struct xfs_inobt_rec_incore rec;
2199 int offset = agino - ibtrec->ir_startino;
2200 int error;
2201 int i;
2202
14dd46cf 2203 cur = xfs_finobt_init_cursor(pag, tp, agbp);
3efa4ffd
BF
2204
2205 error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2206 if (error)
2207 goto error;
2208 if (i == 0) {
2209 /*
2210 * If the record does not exist in the finobt, we must have just
2211 * freed an inode in a previously fully allocated chunk. If not,
2212 * something is out of sync.
2213 */
f9e03706 2214 if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
989d5ec3 2215 xfs_btree_mark_sick(cur);
f9e03706
DW
2216 error = -EFSCORRUPTED;
2217 goto error;
2218 }
3efa4ffd 2219
5419040f
BF
2220 error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2221 ibtrec->ir_count,
2222 ibtrec->ir_freecount,
3efa4ffd
BF
2223 ibtrec->ir_free, &i);
2224 if (error)
2225 goto error;
2226 ASSERT(i == 1);
2227
2228 goto out;
2229 }
2230
2231 /*
2232 * Read and update the existing record. We could just copy the ibtrec
2233 * across here, but that would defeat the purpose of having redundant
2234 * metadata. By making the modifications independently, we can catch
2235 * corruptions that we wouldn't see if we just copied from one record
2236 * to another.
2237 */
2238 error = xfs_inobt_get_rec(cur, &rec, &i);
2239 if (error)
2240 goto error;
f9e03706 2241 if (XFS_IS_CORRUPT(mp, i != 1)) {
989d5ec3 2242 xfs_btree_mark_sick(cur);
f9e03706
DW
2243 error = -EFSCORRUPTED;
2244 goto error;
2245 }
3efa4ffd
BF
2246
2247 rec.ir_free |= XFS_INOBT_MASK(offset);
2248 rec.ir_freecount++;
2249
f9e03706
DW
2250 if (XFS_IS_CORRUPT(mp,
2251 rec.ir_free != ibtrec->ir_free ||
2252 rec.ir_freecount != ibtrec->ir_freecount)) {
989d5ec3 2253 xfs_btree_mark_sick(cur);
f9e03706
DW
2254 error = -EFSCORRUPTED;
2255 goto error;
2256 }
3efa4ffd
BF
2257
2258 /*
2259 * The content of inobt records should always match between the inobt
2260 * and finobt. The lifecycle of records in the finobt is different from
2261 * the inobt in that the finobt only tracks records with at least one
2262 * free inode. Hence, if all of the inodes are free and we aren't
2263 * keeping inode chunks permanently on disk, remove the record.
2264 * Otherwise, update the record with the new information.
999633d3
BF
2265 *
2266 * Note that we currently can't free chunks when the block size is large
2267 * enough for multiple chunks. Leave the finobt record to remain in sync
2268 * with the inobt.
3efa4ffd 2269 */
0560f31a
DC
2270 if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2271 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
3efa4ffd
BF
2272 error = xfs_btree_delete(cur, &i);
2273 if (error)
2274 goto error;
2275 ASSERT(i == 1);
2276 } else {
2277 error = xfs_inobt_update(cur, &rec);
2278 if (error)
2279 goto error;
2280 }
2281
2282out:
9ba0889e 2283 error = xfs_check_agi_freecount(cur);
3efa4ffd
BF
2284 if (error)
2285 goto error;
2286
2287 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2288 return 0;
2289
2290error:
2291 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2292 return error;
2293}
2294
2b64ee5c
BF
2295/*
2296 * Free disk inode. Carefully avoids touching the incore inode, all
2297 * manipulations incore are the caller's responsibility.
2298 * The on-disk inode is not changed by this operation, only the
2299 * btree (free inode mask) is changed.
2300 */
2301int
2302xfs_difree(
f40aadb2
DC
2303 struct xfs_trans *tp,
2304 struct xfs_perag *pag,
2305 xfs_ino_t inode,
2306 struct xfs_icluster *xic)
2b64ee5c
BF
2307{
2308 /* REFERENCED */
2309 xfs_agblock_t agbno; /* block number containing inode */
2310 struct xfs_buf *agbp; /* buffer for allocation group header */
2311 xfs_agino_t agino; /* allocation group inode number */
2b64ee5c 2312 int error; /* error return value */
7b13c515 2313 struct xfs_mount *mp = tp->t_mountp;
2b64ee5c 2314 struct xfs_inobt_rec_incore rec;/* btree record */
2b64ee5c
BF
2315
2316 /*
2317 * Break up inode number into its components.
2318 */
f40aadb2
DC
2319 if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
2320 xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
2321 __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
2b64ee5c 2322 ASSERT(0);
2451337d 2323 return -EINVAL;
2b64ee5c
BF
2324 }
2325 agino = XFS_INO_TO_AGINO(mp, inode);
f40aadb2 2326 if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
2b64ee5c
BF
2327 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
2328 __func__, (unsigned long long)inode,
f40aadb2 2329 (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
2b64ee5c 2330 ASSERT(0);
2451337d 2331 return -EINVAL;
2b64ee5c
BF
2332 }
2333 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2334 if (agbno >= mp->m_sb.sb_agblocks) {
2335 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
2336 __func__, agbno, mp->m_sb.sb_agblocks);
2337 ASSERT(0);
2451337d 2338 return -EINVAL;
2b64ee5c
BF
2339 }
2340 /*
2341 * Get the allocation group header.
2342 */
549d3c9a 2343 error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
2b64ee5c
BF
2344 if (error) {
2345 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2346 __func__, error);
2347 return error;
2348 }
2349
2350 /*
2351 * Fix up the inode allocation btree.
2352 */
dedab3e4 2353 error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
2b64ee5c
BF
2354 if (error)
2355 goto error0;
2356
3efa4ffd
BF
2357 /*
2358 * Fix up the free inode btree.
2359 */
ebd9027d 2360 if (xfs_has_finobt(mp)) {
dedab3e4 2361 error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
3efa4ffd
BF
2362 if (error)
2363 goto error0;
2364 }
2365
2b64ee5c
BF
2366 return 0;
2367
2368error0:
2369 return error;
2370}
2371
7124fe0a
DC
2372STATIC int
2373xfs_imap_lookup(
7b13c515 2374 struct xfs_perag *pag,
498f0adb 2375 struct xfs_trans *tp,
7124fe0a
DC
2376 xfs_agino_t agino,
2377 xfs_agblock_t agbno,
2378 xfs_agblock_t *chunk_agbno,
2379 xfs_agblock_t *offset_agbno,
2380 int flags)
2381{
498f0adb 2382 struct xfs_mount *mp = pag->pag_mount;
7124fe0a
DC
2383 struct xfs_inobt_rec_incore rec;
2384 struct xfs_btree_cur *cur;
2385 struct xfs_buf *agbp;
7124fe0a
DC
2386 int error;
2387 int i;
2388
549d3c9a 2389 error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
7124fe0a 2390 if (error) {
53487786
DC
2391 xfs_alert(mp,
2392 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
7b13c515 2393 __func__, error, pag->pag_agno);
7124fe0a
DC
2394 return error;
2395 }
2396
2397 /*
4536f2ad
DC
2398 * Lookup the inode record for the given agino. If the record cannot be
2399 * found, then it's an invalid inode number and we should abort. Once
2400 * we have a record, we need to ensure it contains the inode number
2401 * we are looking up.
7124fe0a 2402 */
14dd46cf 2403 cur = xfs_inobt_init_cursor(pag, tp, agbp);
4536f2ad 2404 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
7124fe0a
DC
2405 if (!error) {
2406 if (i)
2407 error = xfs_inobt_get_rec(cur, &rec, &i);
2408 if (!error && i == 0)
2451337d 2409 error = -EINVAL;
7124fe0a
DC
2410 }
2411
2412 xfs_trans_brelse(tp, agbp);
0b04b6b8 2413 xfs_btree_del_cursor(cur, error);
7124fe0a
DC
2414 if (error)
2415 return error;
2416
4536f2ad
DC
2417 /* check that the returned record contains the required inode */
2418 if (rec.ir_startino > agino ||
ef325959 2419 rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
2451337d 2420 return -EINVAL;
4536f2ad 2421
7124fe0a 2422 /* for untrusted inodes check it is allocated first */
1920779e 2423 if ((flags & XFS_IGET_UNTRUSTED) &&
7124fe0a 2424 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2451337d 2425 return -EINVAL;
7124fe0a
DC
2426
2427 *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2428 *offset_agbno = agbno - *chunk_agbno;
2429 return 0;
2430}
2431
1da177e4 2432/*
94e1b69d 2433 * Return the location of the inode in imap, for mapping it into a buffer.
1da177e4 2434 */
1da177e4 2435int
94e1b69d 2436xfs_imap(
498f0adb
DC
2437 struct xfs_perag *pag,
2438 struct xfs_trans *tp,
7b13c515
DC
2439 xfs_ino_t ino, /* inode to locate */
2440 struct xfs_imap *imap, /* location map structure */
2441 uint flags) /* flags for inode btree lookup */
1da177e4 2442{
498f0adb 2443 struct xfs_mount *mp = pag->pag_mount;
7b13c515
DC
2444 xfs_agblock_t agbno; /* block number of inode in the alloc group */
2445 xfs_agino_t agino; /* inode number within alloc group */
2446 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
2447 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
2448 int error; /* error code */
2449 int offset; /* index of inode in its buffer */
2450 xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
1da177e4
LT
2451
2452 ASSERT(ino != NULLFSINO);
94e1b69d 2453
1da177e4
LT
2454 /*
2455 * Split up the inode number into its parts.
2456 */
1da177e4
LT
2457 agino = XFS_INO_TO_AGINO(mp, ino);
2458 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
498f0adb 2459 if (agbno >= mp->m_sb.sb_agblocks ||
7b13c515
DC
2460 ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
2461 error = -EINVAL;
1da177e4 2462#ifdef DEBUG
1920779e
DC
2463 /*
2464 * Don't output diagnostic information for untrusted inodes
2465 * as they can be invalid without implying corruption.
2466 */
2467 if (flags & XFS_IGET_UNTRUSTED)
498f0adb 2468 return error;
1da177e4 2469 if (agbno >= mp->m_sb.sb_agblocks) {
53487786
DC
2470 xfs_alert(mp,
2471 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2472 __func__, (unsigned long long)agbno,
2473 (unsigned long)mp->m_sb.sb_agblocks);
1da177e4 2474 }
498f0adb 2475 if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
53487786
DC
2476 xfs_alert(mp,
2477 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
2478 __func__, ino,
7b13c515 2479 XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
1da177e4 2480 }
745b1f47 2481 xfs_stack_trace();
1da177e4 2482#endif /* DEBUG */
498f0adb 2483 return error;
1da177e4 2484 }
94e1b69d 2485
7124fe0a
DC
2486 /*
2487 * For bulkstat and handle lookups, we have an untrusted inode number
2488 * that we have to verify is valid. We cannot do this just by reading
2489 * the inode buffer as it may have been unlinked and removed leaving
2490 * inodes in stale state on disk. Hence we have to do a btree lookup
2491 * in all cases where an untrusted inode number is passed.
2492 */
1920779e 2493 if (flags & XFS_IGET_UNTRUSTED) {
498f0adb 2494 error = xfs_imap_lookup(pag, tp, agino, agbno,
7124fe0a
DC
2495 &chunk_agbno, &offset_agbno, flags);
2496 if (error)
498f0adb 2497 return error;
7124fe0a
DC
2498 goto out_map;
2499 }
2500
94e1b69d
CH
2501 /*
2502 * If the inode cluster size is the same as the blocksize or
2503 * smaller we get to the buffer by simple arithmetics.
2504 */
ef325959 2505 if (M_IGEO(mp)->blocks_per_cluster == 1) {
1da177e4
LT
2506 offset = XFS_INO_TO_OFFSET(mp, ino);
2507 ASSERT(offset < mp->m_sb.sb_inopblock);
94e1b69d 2508
7b13c515 2509 imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
94e1b69d 2510 imap->im_len = XFS_FSB_TO_BB(mp, 1);
755c7bf5
DW
2511 imap->im_boffset = (unsigned short)(offset <<
2512 mp->m_sb.sb_inodelog);
498f0adb 2513 return 0;
1da177e4 2514 }
94e1b69d 2515
94e1b69d
CH
2516 /*
2517 * If the inode chunks are aligned then use simple maths to
2518 * find the location. Otherwise we have to do a btree
2519 * lookup to find the location.
2520 */
ef325959
DW
2521 if (M_IGEO(mp)->inoalign_mask) {
2522 offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
1da177e4
LT
2523 chunk_agbno = agbno - offset_agbno;
2524 } else {
498f0adb 2525 error = xfs_imap_lookup(pag, tp, agino, agbno,
7124fe0a 2526 &chunk_agbno, &offset_agbno, flags);
1da177e4 2527 if (error)
498f0adb 2528 return error;
1da177e4 2529 }
94e1b69d 2530
7124fe0a 2531out_map:
1da177e4
LT
2532 ASSERT(agbno >= chunk_agbno);
2533 cluster_agbno = chunk_agbno +
ef325959
DW
2534 ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
2535 M_IGEO(mp)->blocks_per_cluster);
1da177e4
LT
2536 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2537 XFS_INO_TO_OFFSET(mp, ino);
94e1b69d 2538
7b13c515 2539 imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
ef325959 2540 imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
755c7bf5 2541 imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
94e1b69d
CH
2542
2543 /*
2544 * If the inode number maps to a block outside the bounds
2545 * of the file system then return NULL rather than calling
2546 * read_buf and panicing when we get an error from the
2547 * driver.
2548 */
2549 if ((imap->im_blkno + imap->im_len) >
2550 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
53487786
DC
2551 xfs_alert(mp,
2552 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2553 __func__, (unsigned long long) imap->im_blkno,
94e1b69d
CH
2554 (unsigned long long) imap->im_len,
2555 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
498f0adb 2556 return -EINVAL;
94e1b69d 2557 }
498f0adb 2558 return 0;
1da177e4
LT
2559}
2560
1da177e4 2561/*
aafc3c24
BF
2562 * Log specified fields for the ag hdr (inode section). The growth of the agi
2563 * structure over time requires that we interpret the buffer as two logical
2564 * regions delineated by the end of the unlinked list. This is due to the size
2565 * of the hash table and its location in the middle of the agi.
2566 *
2567 * For example, a request to log a field before agi_unlinked and a field after
2568 * agi_unlinked could cause us to log the entire hash table and use an excessive
2569 * amount of log space. To avoid this behavior, log the region up through
2570 * agi_unlinked in one call and the region after agi_unlinked through the end of
2571 * the structure in another.
1da177e4
LT
2572 */
2573void
2574xfs_ialloc_log_agi(
0d1b9769
DC
2575 struct xfs_trans *tp,
2576 struct xfs_buf *bp,
2577 uint32_t fields)
1da177e4
LT
2578{
2579 int first; /* first byte number */
2580 int last; /* last byte number */
2581 static const short offsets[] = { /* field starting offsets */
2582 /* keep in sync with bit definitions */
2583 offsetof(xfs_agi_t, agi_magicnum),
2584 offsetof(xfs_agi_t, agi_versionnum),
2585 offsetof(xfs_agi_t, agi_seqno),
2586 offsetof(xfs_agi_t, agi_length),
2587 offsetof(xfs_agi_t, agi_count),
2588 offsetof(xfs_agi_t, agi_root),
2589 offsetof(xfs_agi_t, agi_level),
2590 offsetof(xfs_agi_t, agi_freecount),
2591 offsetof(xfs_agi_t, agi_newino),
2592 offsetof(xfs_agi_t, agi_dirino),
2593 offsetof(xfs_agi_t, agi_unlinked),
aafc3c24
BF
2594 offsetof(xfs_agi_t, agi_free_root),
2595 offsetof(xfs_agi_t, agi_free_level),
2a39946c 2596 offsetof(xfs_agi_t, agi_iblocks),
1da177e4
LT
2597 sizeof(xfs_agi_t)
2598 };
2599#ifdef DEBUG
370c782b 2600 struct xfs_agi *agi = bp->b_addr;
1da177e4 2601
69ef921b 2602 ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1da177e4 2603#endif
aafc3c24 2604
1da177e4 2605 /*
aafc3c24
BF
2606 * Compute byte offsets for the first and last fields in the first
2607 * region and log the agi buffer. This only logs up through
2608 * agi_unlinked.
1da177e4 2609 */
aafc3c24
BF
2610 if (fields & XFS_AGI_ALL_BITS_R1) {
2611 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2612 &first, &last);
2613 xfs_trans_log_buf(tp, bp, first, last);
2614 }
2615
1da177e4 2616 /*
aafc3c24
BF
2617 * Mask off the bits in the first region and calculate the first and
2618 * last field offsets for any bits in the second region.
1da177e4 2619 */
aafc3c24
BF
2620 fields &= ~XFS_AGI_ALL_BITS_R1;
2621 if (fields) {
2622 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2623 &first, &last);
2624 xfs_trans_log_buf(tp, bp, first, last);
2625 }
1da177e4
LT
2626}
2627
a6a781a5 2628static xfs_failaddr_t
612cfbfe 2629xfs_agi_verify(
2d7d1e7e 2630 struct xfs_buf *bp)
3702ce6e 2631{
2d7d1e7e
DW
2632 struct xfs_mount *mp = bp->b_mount;
2633 struct xfs_agi *agi = bp->b_addr;
2634 xfs_failaddr_t fa;
2635 uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno);
2636 uint32_t agi_length = be32_to_cpu(agi->agi_length);
2637 int i;
3702ce6e 2638
38c26bfd 2639 if (xfs_has_crc(mp)) {
a45086e2 2640 if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
a6a781a5 2641 return __this_address;
370c782b 2642 if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
a6a781a5 2643 return __this_address;
a45086e2
BF
2644 }
2645
3702ce6e
DC
2646 /*
2647 * Validate the magic number of the agi block.
2648 */
39708c20 2649 if (!xfs_verify_magic(bp, agi->agi_magicnum))
a6a781a5 2650 return __this_address;
983d09ff 2651 if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
a6a781a5 2652 return __this_address;
3702ce6e 2653
2d7d1e7e
DW
2654 fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
2655 if (fa)
2656 return fa;
2657
d2a047f3 2658 if (be32_to_cpu(agi->agi_level) < 1 ||
973975b7 2659 be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
a6a781a5 2660 return __this_address;
d2a047f3 2661
38c26bfd 2662 if (xfs_has_finobt(mp) &&
d2a047f3 2663 (be32_to_cpu(agi->agi_free_level) < 1 ||
973975b7 2664 be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
a6a781a5 2665 return __this_address;
d2a047f3 2666
9f96cc95 2667 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
5089eaff 2668 if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
9f96cc95
DC
2669 continue;
2670 if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
2671 return __this_address;
2672 }
2673
a6a781a5 2674 return NULL;
612cfbfe
DC
2675}
2676
1813dd64
DC
2677static void
2678xfs_agi_read_verify(
612cfbfe
DC
2679 struct xfs_buf *bp)
2680{
dbd329f1 2681 struct xfs_mount *mp = bp->b_mount;
bc1a09b8 2682 xfs_failaddr_t fa;
983d09ff 2683
38c26bfd 2684 if (xfs_has_crc(mp) &&
ce5028cf 2685 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
bc1a09b8
DW
2686 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2687 else {
2688 fa = xfs_agi_verify(bp);
2689 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
2690 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2691 }
612cfbfe
DC
2692}
2693
b0f539de 2694static void
1813dd64 2695xfs_agi_write_verify(
612cfbfe
DC
2696 struct xfs_buf *bp)
2697{
dbd329f1 2698 struct xfs_mount *mp = bp->b_mount;
fb1755a6 2699 struct xfs_buf_log_item *bip = bp->b_log_item;
370c782b 2700 struct xfs_agi *agi = bp->b_addr;
bc1a09b8 2701 xfs_failaddr_t fa;
983d09ff 2702
bc1a09b8
DW
2703 fa = xfs_agi_verify(bp);
2704 if (fa) {
2705 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
983d09ff
DC
2706 return;
2707 }
2708
38c26bfd 2709 if (!xfs_has_crc(mp))
983d09ff
DC
2710 return;
2711
2712 if (bip)
370c782b 2713 agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
f1dbcd7e 2714 xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
3702ce6e
DC
2715}
2716
1813dd64 2717const struct xfs_buf_ops xfs_agi_buf_ops = {
233135b7 2718 .name = "xfs_agi",
39708c20 2719 .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
1813dd64
DC
2720 .verify_read = xfs_agi_read_verify,
2721 .verify_write = xfs_agi_write_verify,
b5572597 2722 .verify_struct = xfs_agi_verify,
1813dd64
DC
2723};
2724
1da177e4
LT
2725/*
2726 * Read in the allocation group header (inode allocation section)
2727 */
2728int
5e1be0fb 2729xfs_read_agi(
61021deb
DC
2730 struct xfs_perag *pag,
2731 struct xfs_trans *tp,
549d3c9a 2732 xfs_buf_flags_t flags,
61021deb 2733 struct xfs_buf **agibpp)
1da177e4 2734{
61021deb 2735 struct xfs_mount *mp = pag->pag_mount;
5e1be0fb 2736 int error;
1da177e4 2737
61021deb 2738 trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
5e1be0fb
CH
2739
2740 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
61021deb 2741 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
549d3c9a 2742 XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
de6077ec
DW
2743 if (xfs_metadata_is_sick(error))
2744 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
1da177e4
LT
2745 if (error)
2746 return error;
200237d6 2747 if (tp)
61021deb 2748 xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
5e1be0fb 2749
61021deb 2750 xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
5e1be0fb
CH
2751 return 0;
2752}
2753
a95fee40
DC
2754/*
2755 * Read in the agi and initialise the per-ag data. If the caller supplies a
2756 * @agibpp, return the locked AGI buffer to them, otherwise release it.
2757 */
5e1be0fb
CH
2758int
2759xfs_ialloc_read_agi(
99b13c7f
DC
2760 struct xfs_perag *pag,
2761 struct xfs_trans *tp,
549d3c9a 2762 int flags,
a95fee40 2763 struct xfs_buf **agibpp)
5e1be0fb 2764{
a95fee40 2765 struct xfs_buf *agibp;
99b13c7f 2766 struct xfs_agi *agi;
5e1be0fb
CH
2767 int error;
2768
99b13c7f 2769 trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
d123031a 2770
549d3c9a
DW
2771 error = xfs_read_agi(pag, tp,
2772 (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2773 &agibp);
5e1be0fb
CH
2774 if (error)
2775 return error;
2776
a95fee40 2777 agi = agibp->b_addr;
7ac2ff8b 2778 if (!xfs_perag_initialised_agi(pag)) {
16259e7d 2779 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
92821e2b 2780 pag->pagi_count = be32_to_cpu(agi->agi_count);
7ac2ff8b 2781 set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
1da177e4 2782 }
1da177e4 2783
5e1be0fb
CH
2784 /*
2785 * It's possible for these to be out of sync if
2786 * we are in the middle of a forced shutdown.
2787 */
2788 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
99b13c7f 2789 xfs_is_shutdown(pag->pag_mount));
a95fee40
DC
2790 if (agibpp)
2791 *agibpp = agibp;
2792 else
2793 xfs_trans_brelse(tp, agibp);
92821e2b
DC
2794 return 0;
2795}
91fb9afc 2796
efc0845f
DW
2797/* How many inodes are backed by inode clusters ondisk? */
2798STATIC int
2799xfs_ialloc_count_ondisk(
2800 struct xfs_btree_cur *cur,
2801 xfs_agino_t low,
2802 xfs_agino_t high,
2803 unsigned int *allocated)
2e001266
DW
2804{
2805 struct xfs_inobt_rec_incore irec;
efc0845f
DW
2806 unsigned int ret = 0;
2807 int has_record;
2808 int error;
2e001266 2809
2e001266 2810 error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
efc0845f
DW
2811 if (error)
2812 return error;
2813
2814 while (has_record) {
2815 unsigned int i, hole_idx;
2816
2e001266 2817 error = xfs_inobt_get_rec(cur, &irec, &has_record);
efc0845f
DW
2818 if (error)
2819 return error;
2820 if (irec.ir_startino > high)
2e001266
DW
2821 break;
2822
efc0845f
DW
2823 for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
2824 if (irec.ir_startino + i < low)
2e001266 2825 continue;
efc0845f
DW
2826 if (irec.ir_startino + i > high)
2827 break;
2828
2829 hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
2830 if (!(irec.ir_holemask & (1U << hole_idx)))
2831 ret++;
2e001266
DW
2832 }
2833
2834 error = xfs_btree_increment(cur, 0, &has_record);
efc0845f
DW
2835 if (error)
2836 return error;
2e001266 2837 }
efc0845f
DW
2838
2839 *allocated = ret;
2840 return 0;
2e001266
DW
2841}
2842
2843/* Is there an inode record covering a given extent? */
2844int
2845xfs_ialloc_has_inodes_at_extent(
2846 struct xfs_btree_cur *cur,
2847 xfs_agblock_t bno,
2848 xfs_extlen_t len,
efc0845f 2849 enum xbtree_recpacking *outcome)
2e001266 2850{
efc0845f
DW
2851 xfs_agino_t agino;
2852 xfs_agino_t last_agino;
2853 unsigned int allocated;
2854 int error;
2e001266 2855
efc0845f
DW
2856 agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
2857 last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
2e001266 2858
efc0845f
DW
2859 error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
2860 if (error)
2861 return error;
2862
2863 if (allocated == 0)
2864 *outcome = XBTREE_RECPACKING_EMPTY;
2865 else if (allocated == last_agino - agino + 1)
2866 *outcome = XBTREE_RECPACKING_FULL;
2867 else
2868 *outcome = XBTREE_RECPACKING_SPARSE;
2869 return 0;
2e001266
DW
2870}
2871
2872struct xfs_ialloc_count_inodes {
2873 xfs_agino_t count;
2874 xfs_agino_t freecount;
2875};
2876
2877/* Record inode counts across all inobt records. */
2878STATIC int
2879xfs_ialloc_count_inodes_rec(
2880 struct xfs_btree_cur *cur,
159eb69d 2881 const union xfs_btree_rec *rec,
2e001266
DW
2882 void *priv)
2883{
2884 struct xfs_inobt_rec_incore irec;
2885 struct xfs_ialloc_count_inodes *ci = priv;
ee12eaaa 2886 xfs_failaddr_t fa;
2e001266
DW
2887
2888 xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
dbfbf3bd 2889 fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
ee12eaaa
DW
2890 if (fa)
2891 return xfs_inobt_complain_bad_rec(cur, fa, &irec);
366a0b8d 2892
2e001266
DW
2893 ci->count += irec.ir_count;
2894 ci->freecount += irec.ir_freecount;
2895
2896 return 0;
2897}
2898
2899/* Count allocated and free inodes under an inobt. */
2900int
2901xfs_ialloc_count_inodes(
2902 struct xfs_btree_cur *cur,
2903 xfs_agino_t *count,
2904 xfs_agino_t *freecount)
2905{
2906 struct xfs_ialloc_count_inodes ci = {0};
2907 int error;
2908
ec793e69 2909 ASSERT(xfs_btree_is_ino(cur->bc_ops));
2e001266
DW
2910 error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2911 if (error)
2912 return error;
2913
2914 *count = ci.count;
2915 *freecount = ci.freecount;
2916 return 0;
2917}
494dba7b
DW
2918
/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size.  This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	/*
	 * di_flags2 bits derived from the feature set; presumably applied to
	 * newly created inodes -- NOTE(review): confirm against inode
	 * allocation callers.
	 */
	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	/* Max records per block (third arg 1/0 selects leaf vs node format). */
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	/* Inodes are allocated in chunks of at least one full block. */
	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	/* Sparse inode support allows smaller minimum allocations. */
	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		igeo->maxicount = 0;
	}

	/*
	 * Compute the desired size of an inode cluster buffer size, which
	 * starts at 8K and (on v5 filesystems) scales up with larger inode
	 * sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment.  The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int new_size = igeo->inode_cluster_size_raw;

		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		/* Only adopt the scaled size if mkfs aligned inodes for it. */
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;
}
13eaec4b
DW
3037
3038/* Compute the location of the root directory inode that is laid out by mkfs. */
3039xfs_ino_t
3040xfs_ialloc_calc_rootino(
3041 struct xfs_mount *mp,
3042 int sunit)
3043{
3044 struct xfs_ino_geometry *igeo = M_IGEO(mp);
3045 xfs_agblock_t first_bno;
3046
3047 /*
3048 * Pre-calculate the geometry of AG 0. We know what it looks like
3049 * because libxfs knows how to create allocation groups now.
3050 *
3051 * first_bno is the first block in which mkfs could possibly have
3052 * allocated the root directory inode, once we factor in the metadata
3053 * that mkfs formats before it. Namely, the four AG headers...
3054 */
3055 first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
3056
3057 /* ...the two free space btree roots... */
3058 first_bno += 2;
3059
3060 /* ...the inode btree root... */
3061 first_bno += 1;
3062
3063 /* ...the initial AGFL... */
3064 first_bno += xfs_alloc_min_freelist(mp, NULL);
3065
3066 /* ...the free inode btree root... */
ebd9027d 3067 if (xfs_has_finobt(mp))
13eaec4b
DW
3068 first_bno++;
3069
3070 /* ...the reverse mapping btree root... */
ebd9027d 3071 if (xfs_has_rmapbt(mp))
13eaec4b
DW
3072 first_bno++;
3073
3074 /* ...the reference count btree... */
ebd9027d 3075 if (xfs_has_reflink(mp))
13eaec4b
DW
3076 first_bno++;
3077
3078 /*
3079 * ...and the log, if it is allocated in the first allocation group.
3080 *
3081 * This can happen with filesystems that only have a single
3082 * allocation group, or very odd geometries created by old mkfs
3083 * versions on very small filesystems.
3084 */
36029dee 3085 if (xfs_ag_contains_log(mp, 0))
13eaec4b
DW
3086 first_bno += mp->m_sb.sb_logblocks;
3087
3088 /*
3089 * Now round first_bno up to whatever allocation alignment is given
3090 * by the filesystem or was passed in.
3091 */
ebd9027d 3092 if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
13eaec4b 3093 first_bno = roundup(first_bno, sunit);
ebd9027d 3094 else if (xfs_has_align(mp) &&
13eaec4b
DW
3095 mp->m_sb.sb_inoalignmt > 1)
3096 first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
3097
3098 return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
3099}
da062d16
DW
3100
3101/*
3102 * Ensure there are not sparse inode clusters that cross the new EOAG.
3103 *
3104 * This is a no-op for non-spinode filesystems since clusters are always fully
3105 * allocated and checking the bnobt suffices. However, a spinode filesystem
3106 * could have a record where the upper inodes are free blocks. If those blocks
3107 * were removed from the filesystem, the inode record would extend beyond EOAG,
3108 * which will be flagged as corruption.
3109 */
3110int
3111xfs_ialloc_check_shrink(
dedab3e4 3112 struct xfs_perag *pag,
da062d16 3113 struct xfs_trans *tp,
da062d16
DW
3114 struct xfs_buf *agibp,
3115 xfs_agblock_t new_length)
3116{
3117 struct xfs_inobt_rec_incore rec;
3118 struct xfs_btree_cur *cur;
bab8b795 3119 xfs_agino_t agino;
da062d16
DW
3120 int has;
3121 int error;
3122
bab8b795 3123 if (!xfs_has_sparseinodes(pag->pag_mount))
da062d16
DW
3124 return 0;
3125
14dd46cf 3126 cur = xfs_inobt_init_cursor(pag, tp, agibp);
da062d16
DW
3127
3128 /* Look up the inobt record that would correspond to the new EOFS. */
bab8b795 3129 agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
da062d16
DW
3130 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
3131 if (error || !has)
3132 goto out;
3133
3134 error = xfs_inobt_get_rec(cur, &rec, &has);
3135 if (error)
3136 goto out;
3137
3138 if (!has) {
baf44fa5 3139 xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
da062d16
DW
3140 error = -EFSCORRUPTED;
3141 goto out;
3142 }
3143
3144 /* If the record covers inodes that would be beyond EOFS, bail out. */
3145 if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
3146 error = -ENOSPC;
3147 goto out;
3148 }
3149out:
3150 xfs_btree_del_cursor(cur, error);
da062d16
DW
3151 return error;
3152}
This page took 1.979264 seconds and 4 git commands to generate.