]> Git Repo - linux.git/blob - fs/bcachefs/journal_types.h
Linux 6.14-rc3
[linux.git] / fs / bcachefs / journal_types.h
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_JOURNAL_TYPES_H
3 #define _BCACHEFS_JOURNAL_TYPES_H
4
5 #include <linux/cache.h>
6 #include <linux/workqueue.h>
7
8 #include "alloc_types.h"
9 #include "super_types.h"
10 #include "fifo.h"
11
/*
 * Largest journal sequence number: all-ones in the low 56 bits.
 *
 * btree write buffer steals 8 bits for its own purposes, which is why the
 * limit is 2^56 - 1 rather than the full range of a 64-bit value.
 */
#define JOURNAL_SEQ_MAX         ((1ULL << 56) - 1)
14
/*
 * Number of journal buffers, expressed as a power of two so that a buffer
 * index can be obtained by masking with JOURNAL_BUF_MASK:
 *
 *   JOURNAL_BUF_BITS - log2 of the number of buffers
 *   JOURNAL_BUF_NR   - number of buffers (1 << JOURNAL_BUF_BITS == 4)
 *   JOURNAL_BUF_MASK - JOURNAL_BUF_NR - 1 (== 3)
 */
#define JOURNAL_BUF_BITS        2
#define JOURNAL_BUF_NR          (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK        (JOURNAL_BUF_NR - 1)
18
19 /*
20  * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to
21  * the journal that are being staged or in flight.
22  */
23 struct journal_buf {
24         struct closure          io;
25         struct jset             *data;
26
27         __BKEY_PADDED(key, BCH_REPLICAS_MAX);
28         struct bch_devs_list    devs_written;
29
30         struct closure_waitlist wait;
31         u64                     last_seq;       /* copy of data->last_seq */
32         long                    expires;
33         u64                     flush_time;
34
35         unsigned                buf_size;       /* size in bytes of @data */
36         unsigned                sectors;        /* maximum size for current entry */
37         unsigned                disk_sectors;   /* maximum size entry could have been, if
38                                                    buf_size was bigger */
39         unsigned                u64s_reserved;
40         bool                    noflush:1;      /* write has already been kicked off, and was noflush */
41         bool                    must_flush:1;   /* something wants a flush */
42         bool                    separate_flush:1;
43         bool                    need_flush_to_write_buffer:1;
44         bool                    write_started:1;
45         bool                    write_allocated:1;
46         bool                    write_done:1;
47         u8                      idx;
48 };
49
/*
 * Something that makes a journal entry dirty - i.e. a btree node that has to be
 * flushed:
 */

/*
 * Types of journal pin. JOURNAL_PIN_TYPE_NR is not a pin type itself: it is
 * the number of types, and is used to size the per-type list arrays in
 * struct journal_entry_pin_list.
 *
 * NOTE(review): btree3..btree0 presumably correspond to btree node levels -
 * confirm against the users of this enum.
 */
enum journal_pin_type {
        JOURNAL_PIN_TYPE_btree3,
        JOURNAL_PIN_TYPE_btree2,
        JOURNAL_PIN_TYPE_btree1,
        JOURNAL_PIN_TYPE_btree0,
        JOURNAL_PIN_TYPE_key_cache,
        JOURNAL_PIN_TYPE_other,
        JOURNAL_PIN_TYPE_NR,
};
64
65 struct journal_entry_pin_list {
66         struct list_head                unflushed[JOURNAL_PIN_TYPE_NR];
67         struct list_head                flushed[JOURNAL_PIN_TYPE_NR];
68         atomic_t                        count;
69         struct bch_devs_list            devs;
70 };
71
72 struct journal;
73 struct journal_entry_pin;
74 typedef int (*journal_pin_flush_fn)(struct journal *j,
75                                 struct journal_entry_pin *, u64);
76
77 struct journal_entry_pin {
78         struct list_head                list;
79         journal_pin_flush_fn            flush;
80         u64                             seq;
81 };
82
83 struct journal_res {
84         bool                    ref;
85         u8                      idx;
86         u16                     u64s;
87         u32                     offset;
88         u64                     seq;
89 };
90
91 union journal_res_state {
92         struct {
93                 atomic64_t      counter;
94         };
95
96         struct {
97                 u64             v;
98         };
99
100         struct {
101                 u64             cur_entry_offset:20,
102                                 idx:2,
103                                 unwritten_idx:2,
104                                 buf0_count:10,
105                                 buf1_count:10,
106                                 buf2_count:10,
107                                 buf3_count:10;
108         };
109 };
110
/* Minimum and maximum journal entry size, in bytes: */
#define JOURNAL_ENTRY_SIZE_MIN          (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX          (4U  << 20) /* 4M */
114
/*
 * We stash some journal state as sentinel values in cur_entry_offset:
 * note - cur_entry_offset is in units of u64s
 *
 * JOURNAL_ENTRY_OFFSET_MAX is the largest value that fits in the 20-bit
 * cur_entry_offset field of union journal_res_state; the three highest values
 * are the sentinels.
 */
#define JOURNAL_ENTRY_OFFSET_MAX        ((1U << 20) - 1)

#define JOURNAL_ENTRY_BLOCKED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL        (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL         (JOURNAL_ENTRY_OFFSET_MAX)
124
/*
 * An amount of journal space; struct journal holds one of these per
 * enum journal_space_from.
 */
struct journal_space {
        /* Units of 512 bytes sectors: */
        unsigned        next_entry; /* How big the next journal entry can be */
        unsigned        total;
};
130
/*
 * Index into journal->space[]. journal_space_nr is the number of entries and
 * is used as that array's size; it is not a valid index itself.
 */
enum journal_space_from {
        journal_space_discarded,
        journal_space_clean_ondisk,
        journal_space_clean,
        journal_space_total,
        journal_space_nr,
};
138
/*
 * x-macro list of journal flags. Each x(n) entry becomes the enumerator
 * JOURNAL_<n> in enum journal_flags, numbered from 0 in list order.
 *
 * NOTE(review): these are presumably bit numbers in journal->flags - confirm.
 */
#define JOURNAL_FLAGS()                 \
        x(replay_done)                  \
        x(running)                      \
        x(may_skip_flush)               \
        x(need_flush_write)             \
        x(space_low)

enum journal_flags {
#define x(n)    JOURNAL_##n,
        JOURNAL_FLAGS()
#undef x
};
151
/*
 * Reasons we may fail to get a journal reservation:
 *
 * x-macro list; each x(n) entry becomes the enumerator JOURNAL_ERR_<n> in
 * enum journal_errors, numbered from 0 in list order (so JOURNAL_ERR_ok == 0).
 */
#define JOURNAL_ERRORS()                \
        x(ok)                           \
        x(retry)                        \
        x(blocked)                      \
        x(max_in_flight)                \
        x(journal_full)                 \
        x(journal_pin_full)             \
        x(journal_stuck)                \
        x(insufficient_devices)

enum journal_errors {
#define x(n)    JOURNAL_ERR_##n,
        JOURNAL_ERRORS()
#undef x
};
168
169 typedef DARRAY(u64)             darray_u64;
170
171 struct journal_bio {
172         struct bch_dev          *ca;
173         unsigned                buf_idx;
174
175         struct bio              bio;
176 };
177
178 /* Embedded in struct bch_fs */
179 struct journal {
180         /* Fastpath stuff up front: */
181         struct {
182
183         union journal_res_state reservations;
184         enum bch_watermark      watermark;
185
186         } __aligned(SMP_CACHE_BYTES);
187
188         unsigned long           flags;
189
190         /* Max size of current journal entry */
191         unsigned                cur_entry_u64s;
192         unsigned                cur_entry_sectors;
193
194         /* Reserved space in journal entry to be used just prior to write */
195         unsigned                entry_u64s_reserved;
196
197
198         /*
199          * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
200          * insufficient devices:
201          */
202         enum journal_errors     cur_entry_error;
203         unsigned                cur_entry_offset_if_blocked;
204
205         unsigned                buf_size_want;
206         /*
207          * We may queue up some things to be journalled (log messages) before
208          * the journal has actually started - stash them here:
209          */
210         darray_u64              early_journal_entries;
211
212         /*
213          * Protects journal_buf->data, when accessing without a jorunal
214          * reservation: for synchronization between the btree write buffer code
215          * and the journal write path:
216          */
217         struct mutex            buf_lock;
218         /*
219          * Two journal entries -- one is currently open for new entries, the
220          * other is possibly being written out.
221          */
222         struct journal_buf      buf[JOURNAL_BUF_NR];
223
224         spinlock_t              lock;
225
226         /* if nonzero, we may not open a new journal entry: */
227         unsigned                blocked;
228
229         /* Used when waiting because the journal was full */
230         wait_queue_head_t       wait;
231         struct closure_waitlist async_wait;
232         struct closure_waitlist reclaim_flush_wait;
233
234         struct delayed_work     write_work;
235         struct workqueue_struct *wq;
236
237         /* Sequence number of most recent journal entry (last entry in @pin) */
238         atomic64_t              seq;
239
240         /* seq, last_seq from the most recent journal entry successfully written */
241         u64                     seq_ondisk;
242         u64                     flushed_seq_ondisk;
243         u64                     flushing_seq;
244         u64                     last_seq_ondisk;
245         u64                     err_seq;
246         u64                     last_empty_seq;
247         u64                     oldest_seq_found_ondisk;
248
249         /*
250          * FIFO of journal entries whose btree updates have not yet been
251          * written out.
252          *
253          * Each entry is a reference count. The position in the FIFO is the
254          * entry's sequence number relative to @seq.
255          *
256          * The journal entry itself holds a reference count, put when the
257          * journal entry is written out. Each btree node modified by the journal
258          * entry also holds a reference count, put when the btree node is
259          * written.
260          *
261          * When a reference count reaches zero, the journal entry is no longer
262          * needed. When all journal entries in the oldest journal bucket are no
263          * longer needed, the bucket can be discarded and reused.
264          */
265         struct {
266                 u64 front, back, size, mask;
267                 struct journal_entry_pin_list *data;
268         }                       pin;
269
270         struct journal_space    space[journal_space_nr];
271
272         u64                     replay_journal_seq;
273         u64                     replay_journal_seq_end;
274
275         struct write_point      wp;
276         spinlock_t              err_lock;
277
278         struct mutex            reclaim_lock;
279         /*
280          * Used for waiting until journal reclaim has freed up space in the
281          * journal:
282          */
283         wait_queue_head_t       reclaim_wait;
284         struct task_struct      *reclaim_thread;
285         bool                    reclaim_kicked;
286         unsigned long           next_reclaim;
287         u64                     nr_direct_reclaim;
288         u64                     nr_background_reclaim;
289
290         unsigned long           last_flushed;
291         struct journal_entry_pin *flush_in_progress;
292         bool                    flush_in_progress_dropped;
293         wait_queue_head_t       pin_flush_wait;
294
295         /* protects advancing ja->discard_idx: */
296         struct mutex            discard_lock;
297         bool                    can_discard;
298
299         unsigned long           last_flush_write;
300
301         u64                     write_start_time;
302
303         u64                     nr_flush_writes;
304         u64                     nr_noflush_writes;
305         u64                     entry_bytes_written;
306
307         struct bch2_time_stats  *flush_write_time;
308         struct bch2_time_stats  *noflush_write_time;
309         struct bch2_time_stats  *flush_seq_time;
310
311 #ifdef CONFIG_DEBUG_LOCK_ALLOC
312         struct lockdep_map      res_map;
313 #endif
314 } __aligned(SMP_CACHE_BYTES);
315
316 /*
317  * Embedded in struct bch_dev. First three fields refer to the array of journal
318  * buckets, in bch_sb.
319  */
320 struct journal_device {
321         /*
322          * For each journal bucket, contains the max sequence number of the
323          * journal writes it contains - so we know when a bucket can be reused.
324          */
325         u64                     *bucket_seq;
326
327         unsigned                sectors_free;
328
329         /*
330          * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
331          */
332         unsigned                discard_idx;            /* Next bucket to discard */
333         unsigned                dirty_idx_ondisk;
334         unsigned                dirty_idx;
335         unsigned                cur_idx;                /* Journal bucket we're currently writing to */
336         unsigned                nr;
337
338         u64                     *buckets;
339
340         /* Bio for journal reads/writes to this device */
341         struct journal_bio      *bio[JOURNAL_BUF_NR];
342
343         /* for bch_journal_read_device */
344         struct closure          read;
345         u64                     highest_seq_found;
346 };
347
/*
 * journal_entry_res - reserve space in every journal entry:
 *
 * @u64s: size of the reservation, in u64s (going by the field name)
 */
struct journal_entry_res {
        unsigned                u64s;
};
354
355 #endif /* _BCACHEFS_JOURNAL_TYPES_H */
This page took 0.048079 seconds and 4 git commands to generate.