// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <[email protected]>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
 * - EXT4_FC_TAG_LINK           - records directory entry link
 * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
 *                                during recovery. Note that the iblocks field
 *                                is not replayed and is instead derived during
 *                                replay.
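 *
 * (For reference, each record in the log begins with the common TLV header
 * from fast_commit.h, followed by fc_len bytes of value:
 *
 *     struct ext4_fc_tl {
 *             __le16 fc_tag;
 *             __le16 fc_len;
 *     };
 *
 * This is a simplified sketch; see fast_commit.h for the per-tag value
 * structs such as ext4_fc_add_range and ext4_fc_dentry_info.)
 *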
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue.
 * During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
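 *     (In this file, steps [2] and [3] correspond to
 *     ext4_fc_submit_inode_data_all() and ext4_fc_wait_inode_data_all(),
 *     step [4] to ext4_fc_commit_dentry_updates(), step [5] to
 *     ext4_fc_write_inode() / ext4_fc_write_inode_data(), and step [6] to
 *     ext4_fc_write_tail(), all driven by ext4_fc_perform_commit().)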
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, a file named A would be absent when we try to read A. So, this sequence
 * of operations is not idempotent. However, as mentioned above, instead of
 * storing the procedure fast commits store the outcome of each procedure. Thus
 * the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and then
 *    try to do recovery again, we will find a file system where the fast commit
 *    area is invalid (because a new full commit would be found). In order to
 *    deal with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at a much higher level
 *    than ext4_journal_start(). This can be made more fine grained by
 *    combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate) {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

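/*
 * Reset the range-tracking state of @inode. Called when tracking starts
 * over for a new transaction, and from the post-commit cleanup path.
 */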
static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ei->i_fc_lblk_start = 0;
        ei->i_fc_lblk_len = 0;
}

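/* Initialize the fast commit fields of an in-core inode. */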
void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
        INIT_LIST_HEAD(&ei->i_fc_list);
        init_waitqueue_head(&ei->i_fc_wait);
        atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high-level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        if (atomic_dec_and_test(&ei->i_fc_updates))
                wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from the fast commit list. If the inode is being committed
 * we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                return;
        }

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
        list_del_init(&ei->i_fc_list);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
        atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set the EXT4_MF_FC_INELIGIBLE flag
 * here to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
            (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
        return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
                atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
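/*
 * (__track_dentry_update(), __track_inode() and __track_range() below are
 * the __fc_track_fn() implementations used with this template.)
 */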
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        bool update = false;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        tid_t tid = 0;
        int ret;

        if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
            (sbi->s_mount_state & EXT4_FC_REPLAY))
                return -EOPNOTSUPP;

        if (ext4_fc_is_ineligible(inode->i_sb))
                return -EINVAL;

        tid = handle->h_transaction->t_tid;
        mutex_lock(&ei->i_fc_lock);
        if (tid == ei->i_sync_tid) {
                update = true;
        } else {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&EXT4_I(inode)->i_fc_list))
                list_add_tail(&EXT4_I(inode)->i_fc_list,
                                (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
                                &sbi->s_fc_q[FC_Q_STAGING] :
                                &sbi->s_fc_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

struct __track_dentry_update_args {
        struct dentry *dentry;
        int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct ext4_fc_dentry_update *node;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_dentry_update_args *dentry_update =
                (struct __track_dentry_update_args *)arg;
        struct dentry *dentry = dentry_update->dentry;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        mutex_unlock(&ei->i_fc_lock);
        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
                mutex_lock(&ei->i_fc_lock);
                return -ENOMEM;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dentry->d_parent->d_inode->i_ino;
        node->fcd_ino = inode->i_ino;
        if (dentry->d_name.len > DNAME_INLINE_LEN) {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(inode->i_sb,
                                EXT4_FC_REASON_NOMEM);
                        mutex_lock(&ei->i_fc_lock);
                        return -ENOMEM;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        } else {
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        }
        node->fcd_name.len = dentry->d_name.len;

        spin_lock(&sbi->s_fc_lock);
        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
                list_add_tail(&node->fcd_list,
                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
        else
                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);
        mutex_lock(&ei->i_fc_lock);

        return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_UNLINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_LINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_CREAT;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        __ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
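/*
 * (A return of -EEXIST here means the inode is already tracked in this
 * transaction; the template above still enqueues the inode in that case.)
 */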
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (update)
                return -EEXIST;

        EXT4_I(inode)->i_fc_lblk_len = 0;

        return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(inode->i_sb,
                                        EXT4_FC_REASON_INODE_JOURNAL_DATA);
                return;
        }

        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
        trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
        ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
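/*
 * Example of the range merge below: if blocks 10..14 are already tracked
 * (i_fc_lblk_start = 10, i_fc_lblk_len = 5) and a new update touches blocks
 * 12..20, the result is i_fc_lblk_start = min(10, 12) = 10 and
 * i_fc_lblk_len = max(14, 20) - 10 + 1 = 11, i.e. blocks 10..20.
 */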
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t oldstart;
        struct __track_range_args *__arg =
                (struct __track_range_args *)arg;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        oldstart = ei->i_fc_lblk_start;

        if (update && ei->i_fc_lblk_len > 0) {
                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
                ei->i_fc_lblk_len =
                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
                                ei->i_fc_lblk_start + 1;
        } else {
                ei->i_fc_lblk_start = __arg->start;
                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
        }

        return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args;
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        args.start = start;
        args.end = end;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

        trace_ext4_fc_track_range(inode, start, end, ret);
}

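/*
 * Submit the current fast commit buffer for write-out. For the tail block
 * (and only if barriers are enabled), REQ_FUA | REQ_PREFLUSH is added so
 * that the commit is durable on media before it is considered complete.
 */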
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        int write_flags = REQ_SYNC;
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

        /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
        if (test_opt(sb, BARRIER) && is_tail)
                write_flags |= REQ_FUA | REQ_PREFLUSH;
        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE, write_flags, bh);
        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
                                u32 *crc)
{
        void *ret;

        ret = memset(dst, 0, len);
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
        return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
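/*
 * For example, assuming a 4k journal block size: with off = 4000, a request
 * for len = 200 does not fit in the current block, so the remaining
 * 4096 - 4000 - 1 - sizeof(tl) bytes are consumed by a PAD tag and the 200
 * bytes are served from the start of a fresh fast commit buffer.
 */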
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl *tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        int ret, off = sbi->s_fc_bytes % bsize;
        int pad_len;

        /*
         * After allocating len, we should have space at least for a 0 byte
         * padding.
         */
        if (len + sizeof(struct ext4_fc_tl) > bsize)
                return NULL;

        if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
                /*
                 * Only allocate from current buffer if we have enough space for
                 * this request AND we have space to add a zero byte padding.
                 */
                if (!sbi->s_fc_bh) {
                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                        if (ret)
                                return NULL;
                        sbi->s_fc_bh = bh;
                }
                sbi->s_fc_bytes += len;
                return sbi->s_fc_bh->b_data + off;
        }
        /* Need to add PAD tag */
        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
        tl->fc_len = cpu_to_le16(pad_len);
        if (crc)
                *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
        if (pad_len > 0)
                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
        return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
                                int len, u32 *crc)
{
        if (crc)
                *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
        return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
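/*
 * (The tail's value is struct ext4_fc_tail from fast_commit.h: the
 * committing transaction's TID followed by the running CRC of the fast
 * commit contents, as written out below.)
 */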
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's not enough space in this block to accommodate the tail.
         */
        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
        dst += sizeof(tl);
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        dst += sizeof(tail.fc_tid);
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl tl;
        u8 *dst;

        dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
        if (!dst)
                return false;

        tl.fc_tag = cpu_to_le16(tag);
        tl.fc_len = cpu_to_le16(len);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

        return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                                   struct ext4_fc_dentry_update *fc_dentry)
{
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        int dlen = fc_dentry->fcd_name.len;
        u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
                                        crc);

        if (!dst)
                return false;

        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
        dst += sizeof(tl);
        ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
        dst += sizeof(fcd);
        ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
        dst += dlen;

        return true;
}

/*
 * Writes the inode in the fast commit space under an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
        int ret;
        struct ext4_iloc iloc;
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        u8 *dst;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                inode_len = EXT4_INODE_SIZE(inode->i_sb);
        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len += ei->i_extra_isize;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        dst = ext4_fc_reserve_space(inode->i_sb,
                        sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!dst)
                return -ECANCELED;

        if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
                return -ECANCELED;
        dst += sizeof(tl);
        if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
                return -ECANCELED;
        dst += sizeof(fc_inode);
        if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
                                        inode_len, crc))
                return -ECANCELED;

        return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_map_blocks map;
        struct ext4_fc_add_range fc_ext;
        struct ext4_fc_del_range lrange;
        struct ext4_extent *ex;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        old_blk_size = ei->i_fc_lblk_start;
        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk_off = old_blk_size;
        jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
                  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

        while (cur_lblk_off <= new_blk_size) {
                map.m_lblk = cur_lblk_off;
                map.m_len = new_blk_size - cur_lblk_off + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        cur_lblk_off++;
                        continue;
                }

                if (ret == 0) {
                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
                        lrange.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                            sizeof(lrange), (u8 *)&lrange, crc))
                                return -ENOSPC;
                } else {
                        unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

                        /* Limit the number of blocks in one extent */
                        map.m_len = min(max, map.m_len);

                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
                                return -ENOSPC;
                }

                cur_lblk_off += map.m_len;
        }

        return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *pos, *n;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (!ext4_test_inode_state(&pos->vfs_inode,
                                           EXT4_STATE_FC_COMMITTING))
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                ret = jbd2_wait_inode_data(journal, pos->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei, *ei_n;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }

                inode = NULL;
                list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
                                         i_fc_list) {
                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
                                inode = &ei->vfs_inode;
                                break;
                        }
                }
                /*
                 * If we don't find the inode in our list, then it was deleted,
                 * in which case we don't need to record its create tag.
                 */
                if (!inode)
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        trace_ext4_fc_commit_start(sb);

        start_time = ktime_get();

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (ext4_fc_is_ineligible(sb))) {
                reason = EXT4_FC_REASON_INELIGIBLE;
                goto out;
        }

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
                goto out;
        } else if (ret) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_START_FAILED;
                goto out;
        }

        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        atomic_inc(&sbi->s_fc_subtid);
        jbd2_fc_end_commit(journal);
out:
        /* Has any ineligible update happened since we started? */
        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_INELIGIBLE;
        }

        spin_lock(&sbi->s_fc_lock);
        if (reason != EXT4_FC_REASON_OK &&
                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
                sbi->s_fc_stats.fc_ineligible_commits++;
        } else {
                sbi->s_fc_stats.fc_num_commits++;
                sbi->s_fc_stats.fc_numblks += nblks;
        }
        spin_unlock(&sbi->s_fc_lock);
        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
        trace_ext4_fc_commit_stop(sb, nblks, reason);
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /*
         * Weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time.
         */
        if (likely(sbi->s_fc_avg_commit_time))
                sbi->s_fc_avg_commit_time = (commit_time +
                                sbi->s_fc_avg_commit_time * 3) / 4;
        else
                sbi->s_fc_avg_commit_time = commit_time;
        jbd_debug(1,
                "Fast commit ended with blks = %d, reason = %d, subtid - %d",
                nblks, reason, subtid);
        if (reason == EXT4_FC_REASON_FC_FAILED)
                return jbd2_fc_end_commit_fallback(journal);
        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
                reason == EXT4_FC_REASON_INELIGIBLE)
                return jbd2_complete_transaction(journal, commit_tid);
        return 0;
}


/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                spin_unlock(&sbi->s_fc_lock);

                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
        char *dname;
};

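/*
 * Decode a dentry TLV into dentry_info_args. The TLV value is a
 * struct ext4_fc_dentry_info (parent inode number, inode number) followed
 * by the dentry name; the name length is whatever remains of fc_len after
 * the fixed-size part.
 */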
static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_dentry_info fcd;

        memcpy(&fcd, val, sizeof(fcd));

        darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
        darg->ino = le32_to_cpu(fcd.fc_ino);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
        darg->dname_len = le16_to_cpu(tl->fc_len) -
                sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        struct inode *inode, *old_parent;
        struct qstr entry;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        entry.name = darg.dname;
        entry.len = darg.dname_len;
        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode %d not found", darg.ino);
                return 0;
        }

        old_parent = ext4_iget(sb, darg.parent_ino,
                                EXT4_IGET_NORMAL);
        if (IS_ERR(old_parent)) {
                jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
                iput(inode);
                return 0;
        }

        ret = __ext4_unlink(NULL, old_parent, &entry, inode);
        /* -ENOENT is ok because the dirent might not exist anymore. */
        if (ret == -ENOENT)
                ret = 0;
        iput(old_parent);
        iput(inode);
        return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                jbd_debug(1, "Failed to obtain dentry");
                dentry_dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                jbd_debug(1, "Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that link already existed since data blocks
         * for the dir in question got persisted before we crashed OR
         * we replayed this tag and crashed before the entire replay
         * could complete.
         */
        if (ret && ret != -EEXIST) {
                jbd_debug(1, "Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
                               u8 *val)
{
        struct inode *inode;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                jbd_debug(1, "Inode not found.");
                return 0;
        }

        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        iput(inode);
        return ret;
}

1424 /*
 * Record all the modified inodes during replay. We use this later to set up
1426  * block bitmaps correctly.
1427  */
1428 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1429 {
1430         struct ext4_fc_replay_state *state;
1431         int i;
1432
1433         state = &EXT4_SB(sb)->s_fc_replay_state;
1434         for (i = 0; i < state->fc_modified_inodes_used; i++)
1435                 if (state->fc_modified_inodes[i] == ino)
1436                         return 0;
        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *fc_modified_inodes;

                fc_modified_inodes = krealloc(state->fc_modified_inodes,
                                sizeof(int) * (state->fc_modified_inodes_size +
                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_modified_inodes)
                        return -ENOMEM;
                /* Update the state only after krealloc() has succeeded. */
                state->fc_modified_inodes = fc_modified_inodes;
                state->fc_modified_inodes_size +=
                                EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1447         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1448         return 0;
1449 }
1450
1451 /*
1452  * Inode replay function
1453  */
1454 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1455                                 u8 *val)
1456 {
1457         struct ext4_fc_inode fc_inode;
1458         struct ext4_inode *raw_inode;
1459         struct ext4_inode *raw_fc_inode;
1460         struct inode *inode = NULL;
1461         struct ext4_iloc iloc;
1462         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1463         struct ext4_extent_header *eh;
1464
1465         memcpy(&fc_inode, val, sizeof(fc_inode));
1466
1467         ino = le32_to_cpu(fc_inode.fc_ino);
1468         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1469
1470         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1471         if (!IS_ERR(inode)) {
1472                 ext4_ext_clear_bb(inode);
1473                 iput(inode);
1474         }
1475         inode = NULL;
1476
        ret = ext4_fc_record_modified_inode(sb, ino);
        if (ret)
                goto out;
1478
1479         raw_fc_inode = (struct ext4_inode *)
1480                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1481         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1482         if (ret)
1483                 goto out;
1484
1485         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1486         raw_inode = ext4_raw_inode(&iloc);
1487
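        /*
         * Copy the inode from the fast commit area in two chunks that skip
         * i_block: everything before i_block, then everything from
         * i_generation onwards. For extent inodes, i_block is left alone
         * except that an empty extent header is installed when the existing
         * one is invalid (the extents themselves are replayed via ADD_RANGE
         * tags); for inline data inodes, i_block is copied verbatim.
         */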
1488         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1489         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1490                 inode_len - offsetof(struct ext4_inode, i_generation));
1491         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1492                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1493                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1494                         memset(eh, 0, sizeof(*eh));
1495                         eh->eh_magic = EXT4_EXT_MAGIC;
1496                         eh->eh_max = cpu_to_le16(
1497                                 (sizeof(raw_inode->i_block) -
1498                                  sizeof(struct ext4_extent_header))
1499                                  / sizeof(struct ext4_extent));
1500                 }
1501         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1502                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1503                         sizeof(raw_inode->i_block));
1504         }
1505
        /* Immediately update the inode on disk. */
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        if (ret)
                goto out_brelse;
        ret = sync_dirty_buffer(iloc.bh);
        if (ret)
                goto out_brelse;
        ret = ext4_mark_inode_used(sb, ino);
        if (ret)
                goto out_brelse;
1516
1517         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1518         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1519         if (IS_ERR(inode)) {
                jbd_debug(1, "Inode not found.");
                brelse(iloc.bh);
                return -EFSCORRUPTED;
1522         }
1523
1524         /*
1525          * Our allocator could have made different decisions than before
1526          * crashing. This should be fixed but until then, we calculate
1527          * the number of blocks the inode.
         * the number of blocks the inode occupies.
1529         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1530                 ext4_ext_replay_set_iblocks(inode);
1531
1532         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533         ext4_reset_inode_seed(inode);
1534
1535         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537         sync_dirty_buffer(iloc.bh);
out_brelse:
        brelse(iloc.bh);
out:
1540         iput(inode);
1541         if (!ret)
1542                 blkdev_issue_flush(sb->s_bdev);
1543
1544         return 0;
1545 }
1546
1547 /*
1548  * Dentry create replay function.
1549  *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the inode
 * for which we are trying to create a dentry here should already have been
 * replayed before we get here.
1553  */
1554 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555                                  u8 *val)
1556 {
1557         int ret = 0;
1558         struct inode *inode = NULL;
1559         struct inode *dir = NULL;
1560         struct dentry_info_args darg;
1561
1562         tl_to_darg(&darg, tl, val);
1563
1564         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565                         darg.parent_ino, darg.dname_len);
1566
        /* This takes care of updating the group descriptor and other metadata */
1568         ret = ext4_mark_inode_used(sb, darg.ino);
1569         if (ret)
1570                 goto out;
1571
1572         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573         if (IS_ERR(inode)) {
1574                 jbd_debug(1, "inode %d not found.", darg.ino);
1575                 inode = NULL;
1576                 ret = -EINVAL;
1577                 goto out;
1578         }
1579
1580         if (S_ISDIR(inode->i_mode)) {
1581                 /*
1582                  * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are set up properly.
1584                  */
1585                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586                 if (IS_ERR(dir)) {
                        jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1588                         goto out;
1589                 }
1590                 ret = ext4_init_new_dir(NULL, dir, inode);
1591                 iput(dir);
1592                 if (ret) {
1593                         ret = 0;
1594                         goto out;
1595                 }
1596         }
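        /*
         * Now create the directory entry itself. Since EXT4_FC_TAG_CREAT
         * records the creation of an inode together with its dentry, the
         * inode ends up with exactly one link.
         */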
1597         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598         if (ret)
1599                 goto out;
1600         set_nlink(inode, 1);
1601         ext4_mark_inode_dirty(NULL, inode);
1602 out:
1603         if (inode)
1604                 iput(inode);
1605         return ret;
1606 }
1607
1608 /*
 * Record physical disk regions which are in use as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
1611  */
1612 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614 {
1615         struct ext4_fc_replay_state *state;
1616         struct ext4_fc_alloc_region *region;
1617
1618         state = &EXT4_SB(sb)->s_fc_replay_state;
        if (state->fc_regions_used == state->fc_regions_size) {
                struct ext4_fc_alloc_region *fc_regions;

                fc_regions = krealloc(state->fc_regions,
                                sizeof(struct ext4_fc_alloc_region) *
                                        (state->fc_regions_size +
                                         EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_regions)
                        return -ENOMEM;
                /* Update the state only after krealloc() has succeeded. */
                state->fc_regions = fc_regions;
                state->fc_regions_size +=
                                EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
1630         region = &state->fc_regions[state->fc_regions_used++];
1631         region->ino = ino;
1632         region->lblk = lblk;
1633         region->pblk = pblk;
1634         region->len = len;
1635
1636         return 0;
1637 }
1638
1639 /* Replay add range tag */
1640 static int ext4_fc_replay_add_range(struct super_block *sb,
1641                                     struct ext4_fc_tl *tl, u8 *val)
1642 {
1643         struct ext4_fc_add_range fc_add_ex;
1644         struct ext4_extent newex, *ex;
1645         struct inode *inode;
1646         ext4_lblk_t start, cur;
1647         int remaining, len;
1648         ext4_fsblk_t start_pblk;
1649         struct ext4_map_blocks map;
1650         struct ext4_ext_path *path = NULL;
1651         int ret;
1652
1653         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658                 ext4_ext_get_actual_len(ex));
1659
1660         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661         if (IS_ERR(inode)) {
1662                 jbd_debug(1, "Inode not found.");
1663                 return 0;
1664         }
1665
        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret) {
                iput(inode);
                return ret;
        }

1668         start = le32_to_cpu(ex->ee_block);
1669         start_pblk = ext4_ext_pblock(ex);
1670         len = ext4_ext_get_actual_len(ex);
1671
1672         cur = start;
1673         remaining = len;
1674         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676                   inode->i_ino);
1677
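        /*
         * Walk the recorded range chunk by chunk. For each chunk one of
         * three cases applies:
         *  (1) the range is not mapped - insert an extent pointing at the
         *      recorded physical blocks;
         *  (2) the range is mapped to different physical blocks - rewrite
         *      the mapping and free the stale blocks;
         *  (3) the range is mapped correctly but the unwritten state
         *      differs - toggle the state in place.
         */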
1678         while (remaining > 0) {
1679                 map.m_lblk = cur;
1680                 map.m_len = remaining;
1681                 map.m_pblk = 0;
1682                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684                 if (ret < 0) {
1685                         iput(inode);
1686                         return 0;
1687                 }
1688
1689                 if (ret == 0) {
1690                         /* Range is not mapped */
1691                         path = ext4_find_extent(inode, cur, NULL, 0);
1692                         if (IS_ERR(path)) {
1693                                 iput(inode);
1694                                 return 0;
1695                         }
1696                         memset(&newex, 0, sizeof(newex));
1697                         newex.ee_block = cpu_to_le32(cur);
1698                         ext4_ext_store_pblock(
1699                                 &newex, start_pblk + cur - start);
1700                         newex.ee_len = cpu_to_le16(map.m_len);
1701                         if (ext4_ext_is_unwritten(ex))
1702                                 ext4_ext_mark_unwritten(&newex);
1703                         down_write(&EXT4_I(inode)->i_data_sem);
1704                         ret = ext4_ext_insert_extent(
1705                                 NULL, inode, &path, &newex, 0);
1706                         up_write((&EXT4_I(inode)->i_data_sem));
1707                         ext4_ext_drop_refs(path);
1708                         kfree(path);
1709                         if (ret) {
1710                                 iput(inode);
1711                                 return 0;
1712                         }
1713                         goto next;
1714                 }
1715
1716                 if (start_pblk + cur - start != map.m_pblk) {
1717                         /*
1718                          * Logical to physical mapping changed. This can happen
1719                          * if this range was removed and then reallocated to
1720                          * map to new physical blocks during a fast commit.
1721                          */
1722                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723                                         ext4_ext_is_unwritten(ex),
1724                                         start_pblk + cur - start);
1725                         if (ret) {
1726                                 iput(inode);
1727                                 return 0;
1728                         }
1729                         /*
1730                          * Mark the old blocks as free since they aren't used
1731                          * anymore. We maintain an array of all the modified
1732                          * inodes. In case these blocks are still used at either
1733                          * a different logical range in the same inode or in
1734                          * some different inode, we will mark them as allocated
1735                          * at the end of the FC replay using our array of
1736                          * modified inodes.
1737                          */
1738                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739                         goto next;
1740                 }
1741
1742                 /* Range is mapped and needs a state change */
                jbd_debug(1, "Converting from %d to %d %lld",
                          !!(map.m_flags & EXT4_MAP_UNWRITTEN),
                          ext4_ext_is_unwritten(ex), map.m_pblk);
1746                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1748                 if (ret) {
1749                         iput(inode);
1750                         return 0;
1751                 }
1752                 /*
1753                  * We may have split the extent tree while toggling the state.
1754                  * Try to shrink the extent tree now.
1755                  */
1756                 ext4_ext_replay_shrink_inode(inode, start + len);
1757 next:
1758                 cur += map.m_len;
1759                 remaining -= map.m_len;
1760         }
1761         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762                                         sb->s_blocksize_bits);
1763         iput(inode);
1764         return 0;
1765 }
1766
1767 /* Replay DEL_RANGE tag */
1768 static int
1769 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770                          u8 *val)
1771 {
1772         struct inode *inode;
1773         struct ext4_fc_del_range lrange;
1774         struct ext4_map_blocks map;
1775         ext4_lblk_t cur, remaining;
1776         int ret;
1777
1778         memcpy(&lrange, val, sizeof(lrange));
1779         cur = le32_to_cpu(lrange.fc_lblk);
1780         remaining = le32_to_cpu(lrange.fc_len);
1781
1782         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786         if (IS_ERR(inode)) {
1787                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788                 return 0;
1789         }
1790
        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret) {
                iput(inode);
                return ret;
        }

1793         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795                         le32_to_cpu(lrange.fc_len));
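        /*
         * First mark the physical blocks backing every mapped chunk in the
         * range as free; the extents themselves are removed by the call to
         * ext4_punch_hole() below.
         */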
1796         while (remaining > 0) {
1797                 map.m_lblk = cur;
1798                 map.m_len = remaining;
1799
1800                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1801                 if (ret < 0) {
1802                         iput(inode);
1803                         return 0;
1804                 }
1805                 if (ret > 0) {
1806                         remaining -= ret;
1807                         cur += ret;
1808                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809                 } else {
1810                         remaining -= map.m_len;
1811                         cur += map.m_len;
1812                 }
1813         }
1814
        /* Cast to loff_t so the shift cannot overflow a 32-bit value. */
        ret = ext4_punch_hole(inode,
                (loff_t)le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
                (loff_t)le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1818         if (ret)
1819                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820         ext4_ext_replay_shrink_inode(inode,
1821                 i_size_read(inode) >> sb->s_blocksize_bits);
1822         ext4_mark_inode_dirty(NULL, inode);
1823         iput(inode);
1824
1825         return 0;
1826 }
1827
1828 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829 {
1830         struct ext4_fc_replay_state *state;
1831         struct inode *inode;
1832         struct ext4_ext_path *path = NULL;
1833         struct ext4_map_blocks map;
1834         int i, ret, j;
1835         ext4_lblk_t cur, end;
1836
1837         state = &EXT4_SB(sb)->s_fc_replay_state;
1838         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840                         EXT4_IGET_NORMAL);
1841                 if (IS_ERR(inode)) {
1842                         jbd_debug(1, "Inode %d not found.",
1843                                 state->fc_modified_inodes[i]);
1844                         continue;
1845                 }
1846                 cur = 0;
1847                 end = EXT_MAX_BLOCKS;
1848                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1849                         iput(inode);
1850                         continue;
1851                 }
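                /*
                 * Walk all extents of this inode and mark the data blocks as
                 * well as the extent tree index blocks as in-use in the block
                 * bitmaps.
                 */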
1852                 while (cur < end) {
1853                         map.m_lblk = cur;
1854                         map.m_len = end - cur;
1855
1856                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1857                         if (ret < 0)
1858                                 break;
1859
1860                         if (ret > 0) {
1861                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1862                                 if (!IS_ERR(path)) {
1863                                         for (j = 0; j < path->p_depth; j++)
1864                                                 ext4_mb_mark_bb(inode->i_sb,
1865                                                         path[j].p_block, 1, 1);
1866                                         ext4_ext_drop_refs(path);
1867                                         kfree(path);
1868                                 }
1869                                 cur += ret;
1870                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1871                                                         map.m_len, 1);
1872                         } else {
1873                                 cur = cur + (map.m_len ? map.m_len : 1);
1874                         }
1875                 }
1876                 iput(inode);
1877         }
1878 }
1879
1880 /*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function to
 * see if it is okay to use a block.
1884  */
1885 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1886 {
1887         int i;
1888         struct ext4_fc_replay_state *state;
1889
1890         state = &EXT4_SB(sb)->s_fc_replay_state;
1891         for (i = 0; i < state->fc_regions_valid; i++) {
1892                 if (state->fc_regions[i].ino == 0 ||
1893                         state->fc_regions[i].len == 0)
1894                         continue;
1895                 if (blk >= state->fc_regions[i].pblk &&
1896                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1897                         return true;
1898         }
1899         return false;
1900 }
1901
1902 /* Cleanup function called after replay */
1903 void ext4_fc_replay_cleanup(struct super_block *sb)
1904 {
1905         struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1908         kfree(sbi->s_fc_replay_state.fc_regions);
1909         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1910 }
1911
1912 /*
1913  * Recovery Scan phase handler
1914  *
1915  * This function is called during the scan phase and is responsible
 * for doing the following things:
1917  * - Make sure the fast commit area has valid tags for replay
1918  * - Count number of tags that need to be replayed by the replay handler
1919  * - Verify CRC
1920  * - Create a list of excluded blocks for allocation during replay phase
1921  *
1922  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1923  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error code to indicate failure. At the end of
 * a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to the number of tags that need to be replayed during the replay phase.
1928  */
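/*
 * Illustrative layout of the fast commit area as walked by the scan loop
 * below (field sizes in bytes; fc_len covers only the value part):
 *
 *   +--------+--------+----- ... -----+--------+--------+----- ... -----+
 *   | fc_tag | fc_len |     value     | fc_tag | fc_len |     value     |
 *   |  (2)   |  (2)   |   (fc_len)    |  (2)   |  (2)   |   (fc_len)    |
 *   +--------+--------+----- ... -----+--------+--------+----- ... -----+
 *
 * The area starts with EXT4_FC_TAG_HEAD and ends with EXT4_FC_TAG_TAIL,
 * which carries the transaction id and the running checksum.
 */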
1929 static int ext4_fc_replay_scan(journal_t *journal,
1930                                 struct buffer_head *bh, int off,
1931                                 tid_t expected_tid)
1932 {
1933         struct super_block *sb = journal->j_private;
1934         struct ext4_sb_info *sbi = EXT4_SB(sb);
1935         struct ext4_fc_replay_state *state;
1936         int ret = JBD2_FC_REPLAY_CONTINUE;
1937         struct ext4_fc_add_range ext;
1938         struct ext4_fc_tl tl;
1939         struct ext4_fc_tail tail;
1940         __u8 *start, *end, *cur, *val;
1941         struct ext4_fc_head head;
1942         struct ext4_extent *ex;
1943
1944         state = &sbi->s_fc_replay_state;
1945
1946         start = (u8 *)bh->b_data;
1947         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1948
1949         if (state->fc_replay_expected_off == 0) {
1950                 state->fc_cur_tag = 0;
1951                 state->fc_replay_num_tags = 0;
1952                 state->fc_crc = 0;
1953                 state->fc_regions = NULL;
1954                 state->fc_regions_valid = state->fc_regions_used =
1955                         state->fc_regions_size = 0;
1956                 /* Check if we can stop early */
1957                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1958                         != EXT4_FC_TAG_HEAD)
1959                         return 0;
1960         }
1961
1962         if (off != state->fc_replay_expected_off) {
1963                 ret = -EFSCORRUPTED;
1964                 goto out_err;
1965         }
1966
1967         state->fc_replay_expected_off++;
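        /*
         * Walk the block one TLV entry at a time: read the tag-length header
         * into 'tl', point 'val' at the value, and advance by sizeof(tl)
         * plus the value length.
         */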
1968         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1969                 memcpy(&tl, cur, sizeof(tl));
1970                 val = cur + sizeof(tl);
1971                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1972                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1973                 switch (le16_to_cpu(tl.fc_tag)) {
1974                 case EXT4_FC_TAG_ADD_RANGE:
1975                         memcpy(&ext, val, sizeof(ext));
1976                         ex = (struct ext4_extent *)&ext.fc_ex;
1977                         ret = ext4_fc_record_regions(sb,
1978                                 le32_to_cpu(ext.fc_ino),
1979                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1980                                 ext4_ext_get_actual_len(ex));
1981                         if (ret < 0)
1982                                 break;
1983                         ret = JBD2_FC_REPLAY_CONTINUE;
1984                         fallthrough;
1985                 case EXT4_FC_TAG_DEL_RANGE:
1986                 case EXT4_FC_TAG_LINK:
1987                 case EXT4_FC_TAG_UNLINK:
1988                 case EXT4_FC_TAG_CREAT:
1989                 case EXT4_FC_TAG_INODE:
1990                 case EXT4_FC_TAG_PAD:
1991                         state->fc_cur_tag++;
1992                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1993                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1994                         break;
1995                 case EXT4_FC_TAG_TAIL:
1996                         state->fc_cur_tag++;
1997                         memcpy(&tail, val, sizeof(tail));
1998                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1999                                                 sizeof(tl) +
2000                                                 offsetof(struct ext4_fc_tail,
2001                                                 fc_crc));
2002                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2003                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2004                                 state->fc_replay_num_tags = state->fc_cur_tag;
2005                                 state->fc_regions_valid =
2006                                         state->fc_regions_used;
2007                         } else {
2008                                 ret = state->fc_replay_num_tags ?
2009                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2010                         }
2011                         state->fc_crc = 0;
2012                         break;
2013                 case EXT4_FC_TAG_HEAD:
2014                         memcpy(&head, val, sizeof(head));
2015                         if (le32_to_cpu(head.fc_features) &
2016                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2017                                 ret = -EOPNOTSUPP;
2018                                 break;
2019                         }
2020                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2021                                 ret = JBD2_FC_REPLAY_STOP;
2022                                 break;
2023                         }
2024                         state->fc_cur_tag++;
2025                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2026                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2027                         break;
2028                 default:
2029                         ret = state->fc_replay_num_tags ?
2030                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2031                 }
2032                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2033                         break;
2034         }
2035
2036 out_err:
2037         trace_ext4_fc_replay_scan(sb, ret, off);
2038         return ret;
2039 }
2040
2041 /*
2042  * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler above.
2044  */
2045 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2046                                 enum passtype pass, int off, tid_t expected_tid)
2047 {
2048         struct super_block *sb = journal->j_private;
2049         struct ext4_sb_info *sbi = EXT4_SB(sb);
2050         struct ext4_fc_tl tl;
2051         __u8 *start, *end, *cur, *val;
2052         int ret = JBD2_FC_REPLAY_CONTINUE;
2053         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2054         struct ext4_fc_tail tail;
2055
2056         if (pass == PASS_SCAN) {
2057                 state->fc_current_pass = PASS_SCAN;
2058                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2059         }
2060
2061         if (state->fc_current_pass != pass) {
2062                 state->fc_current_pass = pass;
2063                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2064         }
2065         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2066                 jbd_debug(1, "Replay stops\n");
2067                 ext4_fc_set_bitmaps_and_counters(sb);
2068                 return 0;
2069         }
2070
2071 #ifdef CONFIG_EXT4_DEBUG
2072         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2073                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2074                 return JBD2_FC_REPLAY_STOP;
2075         }
2076 #endif
2077
2078         start = (u8 *)bh->b_data;
2079         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2080
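        /*
         * The replay pass walks the same TLV stream that the scan pass
         * validated, dispatching each tag to its replay handler until the
         * fc_replay_num_tags entries counted during the scan are consumed.
         */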
2081         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2082                 memcpy(&tl, cur, sizeof(tl));
2083                 val = cur + sizeof(tl);
2084
2085                 if (state->fc_replay_num_tags == 0) {
2086                         ret = JBD2_FC_REPLAY_STOP;
2087                         ext4_fc_set_bitmaps_and_counters(sb);
2088                         break;
2089                 }
2090                 jbd_debug(3, "Replay phase, tag:%s\n",
2091                                 tag2str(le16_to_cpu(tl.fc_tag)));
2092                 state->fc_replay_num_tags--;
2093                 switch (le16_to_cpu(tl.fc_tag)) {
2094                 case EXT4_FC_TAG_LINK:
2095                         ret = ext4_fc_replay_link(sb, &tl, val);
2096                         break;
2097                 case EXT4_FC_TAG_UNLINK:
2098                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2099                         break;
2100                 case EXT4_FC_TAG_ADD_RANGE:
2101                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2102                         break;
2103                 case EXT4_FC_TAG_CREAT:
2104                         ret = ext4_fc_replay_create(sb, &tl, val);
2105                         break;
2106                 case EXT4_FC_TAG_DEL_RANGE:
2107                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2108                         break;
2109                 case EXT4_FC_TAG_INODE:
2110                         ret = ext4_fc_replay_inode(sb, &tl, val);
2111                         break;
2112                 case EXT4_FC_TAG_PAD:
2113                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2114                                              le16_to_cpu(tl.fc_len), 0);
2115                         break;
2116                 case EXT4_FC_TAG_TAIL:
2117                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2118                                              le16_to_cpu(tl.fc_len), 0);
2119                         memcpy(&tail, val, sizeof(tail));
2120                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2121                         break;
2122                 case EXT4_FC_TAG_HEAD:
2123                         break;
2124                 default:
2125                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2126                                              le16_to_cpu(tl.fc_len), 0);
2127                         ret = -ECANCELED;
2128                         break;
2129                 }
2130                 if (ret < 0)
2131                         break;
2132                 ret = JBD2_FC_REPLAY_CONTINUE;
2133         }
2134         return ret;
2135 }
2136
2137 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2138 {
        /*
         * We set the replay callback even if fast commit is disabled because
         * we may still have fast commit blocks that need to be replayed even
         * if fast commit has now been turned off.
         */
2144         journal->j_fc_replay_callback = ext4_fc_replay;
2145         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2146                 return;
2147         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2148 }
2149
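/*
 * Human-readable strings for the fast commit ineligibility reasons. The
 * order must match the EXT4_FC_REASON_* values, since ext4_fc_info_show()
 * indexes this array by reason code.
 */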
2150 static const char *fc_ineligible_reasons[] = {
2151         "Extended attributes changed",
2152         "Cross rename",
2153         "Journal flag changed",
2154         "Insufficient memory",
2155         "Swap boot",
2156         "Resize",
2157         "Dir renamed",
2158         "Falloc range op",
2159         "Data journalling",
2160         "FC Commit Failed"
2161 };
2162
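/*
 * Example fc_info output, as produced by the seq_printf() calls below
 * (the values shown are illustrative):
 *
 *   fc stats:
 *   123 commits
 *   4 ineligible
 *   567 numblks
 *   8910us avg_commit_time
 *   Ineligible reasons:
 *   "Extended attributes changed":       2
 *   ...
 */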
2163 int ext4_fc_info_show(struct seq_file *seq, void *v)
2164 {
2165         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2166         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2167         int i;
2168
2169         if (v != SEQ_START_TOKEN)
2170                 return 0;
2171
2172         seq_printf(seq,
2173                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2174                    stats->fc_num_commits, stats->fc_ineligible_commits,
2175                    stats->fc_numblks,
2176                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2177         seq_puts(seq, "Ineligible reasons:\n");
2178         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2179                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2180                         stats->fc_ineligible_reason_count[i]);
2181
2182         return 0;
2183 }
2184
2185 int __init ext4_fc_init_dentry_cache(void)
2186 {
2187         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2188                                            SLAB_RECLAIM_ACCOUNT);
2189
2190         if (ext4_fc_dentry_cachep == NULL)
2191                 return -ENOMEM;
2192
2193         return 0;
2194 }