/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <[email protected]>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <trace/events/bcache.h>

static struct workqueue_struct *dirty_wq;

static void read_dirty(struct closure *);

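/*
 * State for one in-flight writeback: the closure that tracks it, the
 * cached_dev it belongs to, and the bio used first to read the dirty data
 * from the cache and then to write it to the backing device (its bio_vecs
 * are allocated inline, immediately after this struct).
 */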
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

/* Rate limiting */

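/*
 * The writeback rate is driven by a simple PD controller.  The target is
 * writeback_percent of this device's proportional share of the cache
 * (scaled by bdev_sectors / cached_dev_sectors).  Each update computes the
 * error in 1/256ths of the target:
 *
 *	error  = ((dirty + derivative - target) << 8) / target
 *	change = (rate * error >> 8) / writeback_rate_p_term_inverse
 *
 * where "derivative" is the change in the dirty count since the last
 * update, scaled by writeback_rate_d_term and smoothed with an EWMA
 * (writeback_rate_d_smooth).
 *
 * Rough illustration (numbers are not from the code): with the default
 * p_term_inverse of 64 and no derivative contribution, being 10% over
 * target gives error ~= 25, so the rate grows by about
 * rate * 25 / (256 * 64) per update, i.e. roughly 0.15%.
 */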
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	error = div64_s64((dirty + derivative - target) << 8, target);

	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

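/* Delayed-work handler: re-run the PD controller while there is dirty data. */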
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);
}

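/*
 * How long to wait before issuing the next writeback IO for @sectors
 * sectors, according to the rate limiter.  No throttling while the device
 * is detaching or when writeback_percent is 0 (flush as fast as possible).
 */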
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (atomic_read(&dc->disk.detaching) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
}

/* Background writeback */

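/*
 * Keybuf predicates used when refilling writeback_keys: dirty_pred picks
 * up any dirty key, while dirty_full_stripe_pred additionally requires
 * every stripe the key touches to be completely dirty.  The latter is used
 * when partial_stripes_expensive is set, e.g. for backing devices where a
 * partial-stripe write forces a read-modify-write.
 */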
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
{
	uint64_t stripe;
	unsigned nr_sectors = KEY_SIZE(k);
	struct cached_dev *dc = container_of(buf, struct cached_dev,
					     writeback_keys);
	unsigned stripe_size = 1 << dc->disk.stripe_size_bits;

	if (!KEY_DIRTY(k))
		return false;

	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
	while (1) {
		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
		    stripe_size)
			return false;

		if (nr_sectors <= stripe_size)
			return true;

		nr_sectors -= stripe_size;
		stripe++;
	}
}

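/*
 * Set up the bio embedded in a dirty_io for a transfer of KEY_SIZE
 * sectors, using the bio_vecs allocated inline after the struct.  When
 * writeback_percent is 0 (no rate limiting) the IO is issued at idle
 * priority so it yields to foreground requests.
 */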
static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_size		= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

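/*
 * Scan the btree for dirty keys belonging to this device and refill
 * writeback_keys with them.  If nothing is dirty any more, mark the
 * backing device's superblock clean and stop; if the whole keyspace was
 * scanned, delay the next pass by writeback_delay seconds.  The actual IO
 * is punted to read_dirty() on the writeback workqueue.
 */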
static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	if (dc->partial_stripes_expensive) {
		uint64_t i;

		for (i = 0; i < dc->disk.nr_stripes; i++)
			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
			    1 << dc->disk.stripe_size_bits)
				goto full_stripes;

		goto normal_refill;
full_stripes:
		bch_refill_keybuf(dc->disk.c, buf, &end,
				  dirty_full_stripe_pred);
	} else {
normal_refill:
		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}

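/* Kick off the writeback closure for this device if it isn't already running. */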
void bch_writeback_queue(struct cached_dev *dc)
{
	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);

		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
	}
}

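/*
 * Note that the device now holds dirty data: take a reference, mark the
 * backing device's superblock BDEV_STATE_DIRTY, start the writeback
 * closure and the periodic rate updates.  The has_dirty xchg ensures this
 * only happens once per clean->dirty transition.
 */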
void bch_writeback_add(struct cached_dev *dc)
{
	if (!atomic_read(&dc->has_dirty) &&
	    !atomic_xchg(&dc->has_dirty, 1)) {
		atomic_inc(&dc->count);

		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
			/* XXX: should do this synchronously */
			bch_write_bdev_super(dc, NULL);
		}

		bch_writeback_queue(dc);

		if (dc->writeback_percent)
			schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}
}

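/*
 * Adjust the per-stripe dirty sector counters for a region of the backing
 * device.  @nr_sectors may be negative when sectors go from dirty to clean.
 */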
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_size, stripe_offset;
	uint64_t stripe;

	if (!d)
		return;

	stripe_size = 1 << d->stripe_size_bits;
	stripe = offset >> d->stripe_size_bits;
	stripe_offset = offset & (stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		atomic_add(s, d->stripe_sectors_dirty + stripe);
		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

/* Background writeback - IO loop */

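/*
 * Each key taken from writeback_keys goes through a small pipeline of
 * closure functions on dirty_wq: read_dirty() allocates a dirty_io and
 * reads the data from the cache, write_dirty() writes it to the backing
 * device, and write_dirty_finish() updates the btree to clear the dirty
 * bit and frees everything.
 */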
static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

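/*
 * The btree update uses a BTREE_REPLACE insert keyed on the original
 * (still dirty) key: if the data was overwritten while it was being
 * written back, the insert collides and the key is left alone, counted in
 * writeback_keys_failed.  Pinning the buckets keeps the cached copy from
 * being reused until the insert completes.
 */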
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		if (op.insert_collision)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	atomic_dec_bug(&dc->in_flight);

	closure_wake_up(&dc->writeback_wait);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_sector	= KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, dirty_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, dirty_wq);
}

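/*
 * Main writeback loop: pull keys off writeback_keys, read the dirty data
 * from the cache and kick off the write to the backing device.  Throttling
 * comes from writeback_delay() (short delays are skipped when the next key
 * starts where the previous one ended, so contiguous dirty extents go out
 * back to back) and from capping the number of in-flight writeback IOs at 64.
 */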
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		io->bio.bi_sector	= PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
}

/* Init */

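/*
 * When a backing device is (re)attached its per-stripe dirty counters have
 * to be rebuilt: walk the btree and add up KEY_SIZE for every dirty key
 * that belongs to this device.
 */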
static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
					struct cached_dev *dc)
{
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
		if (!b->level) {
			if (KEY_INODE(k) > dc->disk.id)
				break;

			if (KEY_DIRTY(k))
				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
							     KEY_START(k),
							     KEY_SIZE(k));
		} else {
			btree(sectors_dirty_init, k, b, op, dc);
			if (KEY_INODE(k) > dc->disk.id)
				break;

			cond_resched();
		}

	return 0;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}

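/*
 * Set up writeback state and its default tunables; the tunables can be
 * adjusted at runtime through the cached device's sysfs directory.
 */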
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term	= 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth	= 8;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

void bch_writeback_exit(void)
{
	if (dirty_wq)
		destroy_workqueue(dirty_wq);
}

int __init bch_writeback_init(void)
{
	dirty_wq = create_singlethread_workqueue("bcache_writeback");
	if (!dirty_wq)
		return -ENOMEM;

	return 0;
}