/*
 * fs/logfs/gc.c - garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <[email protected]>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big. A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events. A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time. If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */

static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}

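/*
 * A segment counts as reserved if it is tracked in the reserved-segments
 * btree or if it is one of the currently open (actively written) areas.
 */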
static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved. Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment. Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}

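/*
 * Move a single valid block out of the segment being cleaned by rewriting
 * it through its inode at the given gc_level. logfs_safe_iget/iput keep the
 * inode usable for the duration of the rewrite.
 */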
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

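/*
 * Clean one segment: walk all object headers, rewrite every block that is
 * still valid and return the number of bytes cleaned (object headers
 * included). The segment is temporarily entered into the reserved-segments
 * btree so it is treated as reserved while being cleaned.
 */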
static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}

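/*
 * Insert a candidate into a bounded, sorted list (ordered by erase count or
 * by valid bytes, depending on the list). Returns NULL if the candidate was
 * accepted without overflowing the list. Otherwise the worst candidate
 * (which may be the one just passed in) is unlinked and returned so the
 * caller can offer it to another list or free it.
 */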
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

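/*
 * Remove the best (first in sort order) candidate from @list and return its
 * segment number; if @ec is non-NULL, also return the segment's erase count.
 */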
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with. The reserve_list is used to
 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage. It usually gets the
 * second pick after the reserve_list. But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list. If we have
 * to run garbage collection, we pick a candidate from there. All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while. We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}

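/*
 * Allocate a gc_candidate for @segno, index it in the candidate btree and
 * file it on the appropriate list(s) via __add_candidate().
 */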
static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

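/*
 * Refresh the candidate information for one segment: drop any stale list
 * entry, re-read valid bytes and erase count and re-add the segment with
 * its current root distance. Reserved segments are skipped.
 */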
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}

/*
 * Find the best segment for garbage collection. Main criterion is
 * the segment requiring the least effort to clean. Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}

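/*
 * Garbage collect a single candidate segment. Returns 1 if a segment was
 * cleaned, 0 if no candidate was supplied. All valid bytes must have been
 * moved, hence the BUG_ON(cleaned != valid) below.
 */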
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

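/* Pick the best GC candidate, remove it from its list and clean it. */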
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop. We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data. LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed. An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases. Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it. GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary. However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here. It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small. If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

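/*
 * Rate limiting for wear leveling, driven by super->s_gec: returns 0 and
 * schedules the next event WL_RATELIMIT counts of s_gec in the future when
 * wear leveling may run, 1 while it is still throttled.
 */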
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}

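/*
 * Wear leveling pass for the ostore: subject to wl_ratelimit(), take the
 * lowest-erase-count candidate from the ec_list, compare its erase count
 * against the least-worn free segment plus WL_DELTA and garbage collect it
 * if the check passes, so its data is moved and the segment can be reused.
 */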
static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well. But moving the journal is an
 * expensive operation so we try to avoid it as much as possible. And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations. First we check whether moving the journal would be a
 * significant improvement. That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal. Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling. Journal data is read and rewritten rapidly, comparatively. So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}

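/*
 * Main GC entry point. Flush the journal if dirty data approaches the
 * remaining free space, make sure enough free segments exist for all levels
 * via __logfs_gc_pass(), then give both the ostore and the journal a wear
 * leveling pass.
 */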
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}

static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	gc_level_t gc_level;
	u32 cleaned, valid, ec;
	u32 segno = area->a_segno;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

	if (!area->a_is_open)
		return 0;

	if (super->s_devops->can_write_buf(sb, ofs) == 0)
		return 0;

	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
	/*
	 * The device cannot write back the write buffer. Most likely the
	 * wbuf was already written out and the system crashed at some point
	 * before the journal commit happened. In that case we wouldn't have
	 * to do anything. But if the crash happened before the wbuf was
	 * written out correctly, we must GC this segment. So assume the
	 * worst and always do the GC run.
	 */
	area->a_is_open = 0;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	cleaned = logfs_gc_segment(sb, segno);
	if (cleaned != valid)
		return -EIO;
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node. So we
	 * call the grim visitor to clean up that mess. Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}