drivers/md/dm-bufio.c
1 /*
2  * Copyright (C) 2009-2011 Red Hat, Inc.
3  *
4  * Author: Mikulas Patocka <[email protected]>
5  *
6  * This file is released under the GPL.
7  */
8
9 #include <linux/dm-bufio.h>
10
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/slab.h>
14 #include <linux/sched/mm.h>
15 #include <linux/jiffies.h>
16 #include <linux/vmalloc.h>
17 #include <linux/shrinker.h>
18 #include <linux/module.h>
19 #include <linux/rbtree.h>
20 #include <linux/stacktrace.h>
21 #include <linux/jump_label.h>
22
23 #define DM_MSG_PREFIX "bufio"
24
25 /*
26  * Memory management policy:
27  *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
28  *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
29  *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
30  *      Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
31  *      dirty buffers.
32  */
33 #define DM_BUFIO_MIN_BUFFERS            8
34
35 #define DM_BUFIO_MEMORY_PERCENT         2
36 #define DM_BUFIO_VMALLOC_PERCENT        25
37 #define DM_BUFIO_WRITEBACK_RATIO        3
38 #define DM_BUFIO_LOW_WATERMARK_RATIO    16
39
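/*
 * Worked example of the limits above (illustrative numbers, not from this
 * file): on a 64-bit machine with 16 GiB of RAM, DM_BUFIO_MEMORY_PERCENT
 * caps the default cache at 2% of 16 GiB = ~320 MiB; on a 32-bit machine
 * with 128 MiB of vmalloc space, DM_BUFIO_VMALLOC_PERCENT caps it at
 * 25% of 128 MiB = 32 MiB, and the lower of the two limits wins.
 */
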
40 /*
41  * Check buffer ages in this interval (seconds)
42  */
43 #define DM_BUFIO_WORK_TIMER_SECS        30
44
45 /*
46  * Free buffers when they are older than this (seconds)
47  */
48 #define DM_BUFIO_DEFAULT_AGE_SECS       300
49
50 /*
51  * The number of bytes of cached data to keep around.
52  */
53 #define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
54
55 /*
56  * Align buffer writes to this boundary.
57  * Tests show that SSDs have the highest IOPS when using 4k writes.
58  */
59 #define DM_BUFIO_WRITE_ALIGN            4096
60
61 /*
62  * dm_buffer->list_mode
63  */
64 #define LIST_CLEAN      0
65 #define LIST_DIRTY      1
66 #define LIST_SIZE       2
67
68 /*
69  * Linking of buffers:
70  *      All buffers are linked to buffer_tree with their node field.
71  *
72  *      Clean buffers that are not being written (B_WRITING not set)
73  *      are linked to lru[LIST_CLEAN] with their lru_list field.
74  *
75  *      Dirty and clean buffers that are being written are linked to
76  *      lru[LIST_DIRTY] with their lru_list field. When the write
77  *      finishes, the buffer cannot be relinked immediately (because we
78  *      are in an interrupt context and relinking requires process
79  *      context), so some clean-not-writing buffers can be held on the
80  *      dirty list (lru[LIST_DIRTY]) too.  They are later moved back to
81  *      the clean list in process context.
82  */
83 struct dm_bufio_client {
84         struct mutex lock;
85         spinlock_t spinlock;
86         unsigned long spinlock_flags;
87
88         struct list_head lru[LIST_SIZE];
89         unsigned long n_buffers[LIST_SIZE];
90
91         struct block_device *bdev;
92         unsigned block_size;
93         s8 sectors_per_block_bits;
94         void (*alloc_callback)(struct dm_buffer *);
95         void (*write_callback)(struct dm_buffer *);
96         bool no_sleep;
97
98         struct kmem_cache *slab_buffer;
99         struct kmem_cache *slab_cache;
100         struct dm_io_client *dm_io;
101
102         struct list_head reserved_buffers;
103         unsigned need_reserved_buffers;
104
105         unsigned minimum_buffers;
106
107         struct rb_root buffer_tree;
108         wait_queue_head_t free_buffer_wait;
109
110         sector_t start;
111
112         int async_write_error;
113
114         struct list_head client_list;
115
116         struct shrinker shrinker;
117         struct work_struct shrink_work;
118         atomic_long_t need_shrink;
119 };
120
121 /*
122  * Buffer state bits.
123  */
124 #define B_READING       0
125 #define B_WRITING       1
126 #define B_DIRTY         2
127
128 /*
129  * Describes how the block was allocated:
130  * kmem_cache_alloc(), __get_free_pages() or vmalloc().
131  * See the comment at alloc_buffer_data.
132  */
133 enum data_mode {
134         DATA_MODE_SLAB = 0,
135         DATA_MODE_GET_FREE_PAGES = 1,
136         DATA_MODE_VMALLOC = 2,
137         DATA_MODE_LIMIT = 3
138 };
139
140 struct dm_buffer {
141         struct rb_node node;
142         struct list_head lru_list;
143         struct list_head global_list;
144         sector_t block;
145         void *data;
146         unsigned char data_mode;                /* DATA_MODE_* */
147         unsigned char list_mode;                /* LIST_* */
148         blk_status_t read_error;
149         blk_status_t write_error;
150         unsigned accessed;
151         unsigned hold_count;
152         unsigned long state;
153         unsigned long last_accessed;
154         unsigned dirty_start;
155         unsigned dirty_end;
156         unsigned write_start;
157         unsigned write_end;
158         struct dm_bufio_client *c;
159         struct list_head write_list;
160         void (*end_io)(struct dm_buffer *, blk_status_t);
161 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
162 #define MAX_STACK 10
163         unsigned int stack_len;
164         unsigned long stack_entries[MAX_STACK];
165 #endif
166 };
167
168 static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
169
170 /*----------------------------------------------------------------*/
171
172 #define dm_bufio_in_request()   (!!current->bio_list)
173
174 static void dm_bufio_lock(struct dm_bufio_client *c)
175 {
176         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
177                 spin_lock_irqsave_nested(&c->spinlock, c->spinlock_flags, dm_bufio_in_request());
178         else
179                 mutex_lock_nested(&c->lock, dm_bufio_in_request());
180 }
181
182 static int dm_bufio_trylock(struct dm_bufio_client *c)
183 {
184         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
185                 return spin_trylock_irqsave(&c->spinlock, c->spinlock_flags);
186         else
187                 return mutex_trylock(&c->lock);
188 }
189
190 static void dm_bufio_unlock(struct dm_bufio_client *c)
191 {
192         if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
193                 spin_unlock_irqrestore(&c->spinlock, c->spinlock_flags);
194         else
195                 mutex_unlock(&c->lock);
196 }
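
/*
 * A minimal sketch of how these helpers are used (illustrative only; the
 * real callers are the functions below).  All buffer-tree and LRU
 * manipulation happens between dm_bufio_lock() and dm_bufio_unlock(), and
 * the same call works whether the client uses the mutex or, with no_sleep
 * set, the spinlock:
 *
 *	dm_bufio_lock(c);
 *	b = __find(c, block);
 *	if (b)
 *		b->hold_count++;
 *	dm_bufio_unlock(c);
 */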
197
198 /*----------------------------------------------------------------*/
199
200 /*
201  * Default cache size: available memory divided by the ratio.
202  */
203 static unsigned long dm_bufio_default_cache_size;
204
205 /*
206  * Total cache size set by the user.
207  */
208 static unsigned long dm_bufio_cache_size;
209
210 /*
211  * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
212  * at any time.  If it disagrees, the user has changed cache size.
213  */
214 static unsigned long dm_bufio_cache_size_latch;
215
216 static DEFINE_SPINLOCK(global_spinlock);
217
218 static LIST_HEAD(global_queue);
219
220 static unsigned long global_num = 0;
221
222 /*
223  * Buffers are freed after this timeout
224  */
225 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
226 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
227
228 static unsigned long dm_bufio_peak_allocated;
229 static unsigned long dm_bufio_allocated_kmem_cache;
230 static unsigned long dm_bufio_allocated_get_free_pages;
231 static unsigned long dm_bufio_allocated_vmalloc;
232 static unsigned long dm_bufio_current_allocated;
233
234 /*----------------------------------------------------------------*/
235
236 /*
237  * The current number of clients.
238  */
239 static int dm_bufio_client_count;
240
241 /*
242  * The list of all clients.
243  */
244 static LIST_HEAD(dm_bufio_all_clients);
245
246 /*
247  * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
248  */
249 static DEFINE_MUTEX(dm_bufio_clients_lock);
250
251 static struct workqueue_struct *dm_bufio_wq;
252 static struct delayed_work dm_bufio_cleanup_old_work;
253 static struct work_struct dm_bufio_replacement_work;
254
255
256 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
257 static void buffer_record_stack(struct dm_buffer *b)
258 {
259         b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
260 }
261 #endif
262
263 /*----------------------------------------------------------------
264  * A red/black tree acts as an index for all the buffers.
265  *--------------------------------------------------------------*/
266 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
267 {
268         struct rb_node *n = c->buffer_tree.rb_node;
269         struct dm_buffer *b;
270
271         while (n) {
272                 b = container_of(n, struct dm_buffer, node);
273
274                 if (b->block == block)
275                         return b;
276
277                 n = block < b->block ? n->rb_left : n->rb_right;
278         }
279
280         return NULL;
281 }
282
283 static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
284 {
285         struct rb_node *n = c->buffer_tree.rb_node;
286         struct dm_buffer *b;
287         struct dm_buffer *best = NULL;
288
289         while (n) {
290                 b = container_of(n, struct dm_buffer, node);
291
292                 if (b->block == block)
293                         return b;
294
295                 if (block <= b->block) {
296                         n = n->rb_left;
297                         best = b;
298                 } else {
299                         n = n->rb_right;
300                 }
301         }
302
303         return best;
304 }
305
306 static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
307 {
308         struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
309         struct dm_buffer *found;
310
311         while (*new) {
312                 found = container_of(*new, struct dm_buffer, node);
313
314                 if (found->block == b->block) {
315                         BUG_ON(found != b);
316                         return;
317                 }
318
319                 parent = *new;
320                 new = b->block < found->block ?
321                         &found->node.rb_left : &found->node.rb_right;
322         }
323
324         rb_link_node(&b->node, parent, new);
325         rb_insert_color(&b->node, &c->buffer_tree);
326 }
327
328 static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
329 {
330         rb_erase(&b->node, &c->buffer_tree);
331 }
332
333 /*----------------------------------------------------------------*/
334
335 static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
336 {
337         unsigned char data_mode;
338         long diff;
339
340         static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
341                 &dm_bufio_allocated_kmem_cache,
342                 &dm_bufio_allocated_get_free_pages,
343                 &dm_bufio_allocated_vmalloc,
344         };
345
346         data_mode = b->data_mode;
347         diff = (long)b->c->block_size;
348         if (unlink)
349                 diff = -diff;
350
351         spin_lock(&global_spinlock);
352
353         *class_ptr[data_mode] += diff;
354
355         dm_bufio_current_allocated += diff;
356
357         if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
358                 dm_bufio_peak_allocated = dm_bufio_current_allocated;
359
360         b->accessed = 1;
361
362         if (!unlink) {
363                 list_add(&b->global_list, &global_queue);
364                 global_num++;
365                 if (dm_bufio_current_allocated > dm_bufio_cache_size)
366                         queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
367         } else {
368                 list_del(&b->global_list);
369                 global_num--;
370         }
371
372         spin_unlock(&global_spinlock);
373 }
374
375 /*
376  * Change the number of clients and recalculate per-client limit.
377  */
378 static void __cache_size_refresh(void)
379 {
380         BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
381         BUG_ON(dm_bufio_client_count < 0);
382
383         dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
384
385         /*
386          * Use default if set to 0 and report the actual cache size used.
387          */
388         if (!dm_bufio_cache_size_latch) {
389                 (void)cmpxchg(&dm_bufio_cache_size, 0,
390                               dm_bufio_default_cache_size);
391                 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
392         }
393 }
394
395 /*
396  * Allocating buffer data.
397  *
398  * Small buffers are allocated with kmem_cache, to use space optimally.
399  *
400  * For large buffers, we choose between get_free_pages and vmalloc.
401  * Each has advantages and disadvantages.
402  *
403  * __get_free_pages can randomly fail if the memory is fragmented.
404  * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
405  * as low as 128M) so using it for caching is not appropriate.
406  *
407  * If the allocation may fail we use __get_free_pages. Memory fragmentation
408  * won't have a fatal effect here, but it just causes flushes of some other
409  * buffers and more I/O will be performed. Don't use __get_free_pages if it
410  * always fails (i.e. order >= MAX_ORDER).
411  *
412  * If the allocation shouldn't fail we use __vmalloc. This is only for the
413  * initial reserve allocation, so there's no risk of wasting all vmalloc
414  * space.
415  */
416 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
417                                unsigned char *data_mode)
418 {
419         if (unlikely(c->slab_cache != NULL)) {
420                 *data_mode = DATA_MODE_SLAB;
421                 return kmem_cache_alloc(c->slab_cache, gfp_mask);
422         }
423
424         if (c->block_size <= KMALLOC_MAX_SIZE &&
425             gfp_mask & __GFP_NORETRY) {
426                 *data_mode = DATA_MODE_GET_FREE_PAGES;
427                 return (void *)__get_free_pages(gfp_mask,
428                                                 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
429         }
430
431         *data_mode = DATA_MODE_VMALLOC;
432
433         /*
434          * __vmalloc allocates the data pages and auxiliary structures with
435          * gfp_flags that were specified, but pagetables are always allocated
436          * with GFP_KERNEL, no matter what was specified as gfp_mask.
437          *
438          * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
439          * all allocations done by this process (including pagetables) are done
440          * as if GFP_NOIO was specified.
441          */
442         if (gfp_mask & __GFP_NORETRY) {
443                 unsigned noio_flag = memalloc_noio_save();
444                 void *ptr = __vmalloc(c->block_size, gfp_mask);
445
446                 memalloc_noio_restore(noio_flag);
447                 return ptr;
448         }
449
450         return __vmalloc(c->block_size, gfp_mask);
451 }
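
/*
 * Worked example of the __get_free_pages() order above (illustrative
 * numbers): for a 64 KiB block on 4 KiB pages, sectors_per_block_bits is 7
 * (64 KiB = 128 sectors) and PAGE_SHIFT - SECTOR_SHIFT is 12 - 9 = 3, so
 * the allocation order is 7 - 3 = 4, i.e. 16 contiguous pages = 64 KiB.
 */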
452
453 /*
454  * Free buffer's data.
455  */
456 static void free_buffer_data(struct dm_bufio_client *c,
457                              void *data, unsigned char data_mode)
458 {
459         switch (data_mode) {
460         case DATA_MODE_SLAB:
461                 kmem_cache_free(c->slab_cache, data);
462                 break;
463
464         case DATA_MODE_GET_FREE_PAGES:
465                 free_pages((unsigned long)data,
466                            c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
467                 break;
468
469         case DATA_MODE_VMALLOC:
470                 vfree(data);
471                 break;
472
473         default:
474                 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
475                        data_mode);
476                 BUG();
477         }
478 }
479
480 /*
481  * Allocate buffer and its data.
482  */
483 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
484 {
485         struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
486
487         if (!b)
488                 return NULL;
489
490         b->c = c;
491
492         b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
493         if (!b->data) {
494                 kmem_cache_free(c->slab_buffer, b);
495                 return NULL;
496         }
497
498 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
499         b->stack_len = 0;
500 #endif
501         return b;
502 }
503
504 /*
505  * Free buffer and its data.
506  */
507 static void free_buffer(struct dm_buffer *b)
508 {
509         struct dm_bufio_client *c = b->c;
510
511         free_buffer_data(c, b->data, b->data_mode);
512         kmem_cache_free(c->slab_buffer, b);
513 }
514
515 /*
516  * Link buffer to the buffer tree and clean or dirty queue.
517  */
518 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
519 {
520         struct dm_bufio_client *c = b->c;
521
522         c->n_buffers[dirty]++;
523         b->block = block;
524         b->list_mode = dirty;
525         list_add(&b->lru_list, &c->lru[dirty]);
526         __insert(b->c, b);
527         b->last_accessed = jiffies;
528
529         adjust_total_allocated(b, false);
530 }
531
532 /*
533  * Unlink buffer from the buffer tree and dirty or clean queue.
534  */
535 static void __unlink_buffer(struct dm_buffer *b)
536 {
537         struct dm_bufio_client *c = b->c;
538
539         BUG_ON(!c->n_buffers[b->list_mode]);
540
541         c->n_buffers[b->list_mode]--;
542         __remove(b->c, b);
543         list_del(&b->lru_list);
544
545         adjust_total_allocated(b, true);
546 }
547
548 /*
549  * Place the buffer to the head of dirty or clean LRU queue.
550  */
551 static void __relink_lru(struct dm_buffer *b, int dirty)
552 {
553         struct dm_bufio_client *c = b->c;
554
555         b->accessed = 1;
556
557         BUG_ON(!c->n_buffers[b->list_mode]);
558
559         c->n_buffers[b->list_mode]--;
560         c->n_buffers[dirty]++;
561         b->list_mode = dirty;
562         list_move(&b->lru_list, &c->lru[dirty]);
563         b->last_accessed = jiffies;
564 }
565
566 /*----------------------------------------------------------------
567  * Submit I/O on the buffer.
568  *
569  * Bio interface is faster but it has some problems:
570  *      the vector list is limited (increasing this limit increases
571  *      memory-consumption per buffer, so it is not viable);
572  *
573  *      the memory must be direct-mapped, not vmalloced;
574  *
 575  * If the buffer is not vmalloced, try using the bio interface.
 576  *
 577  * If the buffer is vmalloced, if allocating the bio fails, or if the
 578  * underlying device rejects the bio because it is too large, use the
 579  * dm-io layer to do the I/O.
580  * The dm-io layer splits the I/O into multiple requests, avoiding the above
581  * shortcomings.
582  *--------------------------------------------------------------*/
583
584 /*
585  * dm-io completion routine. It just calls b->end_io, pretending that
586  * the request was handled directly with the bio interface.
587  */
588 static void dmio_complete(unsigned long error, void *context)
589 {
590         struct dm_buffer *b = context;
591
592         b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
593 }
594
595 static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
596                      unsigned n_sectors, unsigned offset)
597 {
598         int r;
599         struct dm_io_request io_req = {
600                 .bi_opf = op,
601                 .notify.fn = dmio_complete,
602                 .notify.context = b,
603                 .client = b->c->dm_io,
604         };
605         struct dm_io_region region = {
606                 .bdev = b->c->bdev,
607                 .sector = sector,
608                 .count = n_sectors,
609         };
610
611         if (b->data_mode != DATA_MODE_VMALLOC) {
612                 io_req.mem.type = DM_IO_KMEM;
613                 io_req.mem.ptr.addr = (char *)b->data + offset;
614         } else {
615                 io_req.mem.type = DM_IO_VMA;
616                 io_req.mem.ptr.vma = (char *)b->data + offset;
617         }
618
619         r = dm_io(&io_req, 1, &region, NULL);
620         if (unlikely(r))
621                 b->end_io(b, errno_to_blk_status(r));
622 }
623
624 static void bio_complete(struct bio *bio)
625 {
626         struct dm_buffer *b = bio->bi_private;
627         blk_status_t status = bio->bi_status;
628         bio_uninit(bio);
629         kfree(bio);
630         b->end_io(b, status);
631 }
632
633 static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
634                     unsigned n_sectors, unsigned offset)
635 {
636         struct bio *bio;
637         char *ptr;
638         unsigned vec_size, len;
639
640         vec_size = b->c->block_size >> PAGE_SHIFT;
641         if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
642                 vec_size += 2;
643
644         bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
645         if (!bio) {
646 dmio:
647                 use_dmio(b, op, sector, n_sectors, offset);
648                 return;
649         }
650         bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op);
651         bio->bi_iter.bi_sector = sector;
652         bio->bi_end_io = bio_complete;
653         bio->bi_private = b;
654
655         ptr = (char *)b->data + offset;
656         len = n_sectors << SECTOR_SHIFT;
657
658         do {
659                 unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
660                 if (!bio_add_page(bio, virt_to_page(ptr), this_step,
661                                   offset_in_page(ptr))) {
662                         bio_put(bio);
663                         goto dmio;
664                 }
665
666                 len -= this_step;
667                 ptr += this_step;
668         } while (len > 0);
669
670         submit_bio(bio);
671 }
672
673 static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
674 {
675         sector_t sector;
676
677         if (likely(c->sectors_per_block_bits >= 0))
678                 sector = block << c->sectors_per_block_bits;
679         else
680                 sector = block * (c->block_size >> SECTOR_SHIFT);
681         sector += c->start;
682
683         return sector;
684 }
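
/*
 * Worked example (illustrative numbers): with 4 KiB blocks,
 * sectors_per_block_bits is 3, so block 10 maps to sector 10 << 3 = 80,
 * plus c->start if the client was created with an offset into the device.
 * For a non-power-of-two block size (sectors_per_block_bits < 0) the same
 * result comes from multiplying by block_size >> SECTOR_SHIFT.
 */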
685
686 static void submit_io(struct dm_buffer *b, enum req_op op,
687                       void (*end_io)(struct dm_buffer *, blk_status_t))
688 {
689         unsigned n_sectors;
690         sector_t sector;
691         unsigned offset, end;
692
693         b->end_io = end_io;
694
695         sector = block_to_sector(b->c, b->block);
696
697         if (op != REQ_OP_WRITE) {
698                 n_sectors = b->c->block_size >> SECTOR_SHIFT;
699                 offset = 0;
700         } else {
701                 if (b->c->write_callback)
702                         b->c->write_callback(b);
703                 offset = b->write_start;
704                 end = b->write_end;
705                 offset &= -DM_BUFIO_WRITE_ALIGN;
706                 end += DM_BUFIO_WRITE_ALIGN - 1;
707                 end &= -DM_BUFIO_WRITE_ALIGN;
708                 if (unlikely(end > b->c->block_size))
709                         end = b->c->block_size;
710
711                 sector += offset >> SECTOR_SHIFT;
712                 n_sectors = (end - offset) >> SECTOR_SHIFT;
713         }
714
715         if (b->data_mode != DATA_MODE_VMALLOC)
716                 use_bio(b, op, sector, n_sectors, offset);
717         else
718                 use_dmio(b, op, sector, n_sectors, offset);
719 }
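
/*
 * Worked example of the write alignment above (illustrative numbers): if a
 * caller dirtied bytes [100, 5000) of an 8 KiB buffer, offset is rounded
 * down to 0 and end is rounded up to 8192, so the write covers sectors
 * [0, 16) of the block instead of a misaligned 4900-byte span.
 */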
720
721 /*----------------------------------------------------------------
722  * Writing dirty buffers
723  *--------------------------------------------------------------*/
724
725 /*
726  * The endio routine for write.
727  *
728  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
729  * it.
730  */
731 static void write_endio(struct dm_buffer *b, blk_status_t status)
732 {
733         b->write_error = status;
734         if (unlikely(status)) {
735                 struct dm_bufio_client *c = b->c;
736
737                 (void)cmpxchg(&c->async_write_error, 0,
738                                 blk_status_to_errno(status));
739         }
740
741         BUG_ON(!test_bit(B_WRITING, &b->state));
742
743         smp_mb__before_atomic();
744         clear_bit(B_WRITING, &b->state);
745         smp_mb__after_atomic();
746
747         wake_up_bit(&b->state, B_WRITING);
748 }
749
750 /*
751  * Initiate a write on a dirty buffer, but don't wait for it.
752  *
753  * - If the buffer is not dirty, exit.
754  * - If there is a previous write going on, wait for it to finish (we can't
755  *   have two writes on the same buffer simultaneously).
756  * - Submit our write and don't wait on it. We set B_WRITING indicating
757  *   that there is a write in progress.
758  */
759 static void __write_dirty_buffer(struct dm_buffer *b,
760                                  struct list_head *write_list)
761 {
762         if (!test_bit(B_DIRTY, &b->state))
763                 return;
764
765         clear_bit(B_DIRTY, &b->state);
766         wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
767
768         b->write_start = b->dirty_start;
769         b->write_end = b->dirty_end;
770
771         if (!write_list)
772                 submit_io(b, REQ_OP_WRITE, write_endio);
773         else
774                 list_add_tail(&b->write_list, write_list);
775 }
776
777 static void __flush_write_list(struct list_head *write_list)
778 {
779         struct blk_plug plug;
780         blk_start_plug(&plug);
781         while (!list_empty(write_list)) {
782                 struct dm_buffer *b =
783                         list_entry(write_list->next, struct dm_buffer, write_list);
784                 list_del(&b->write_list);
785                 submit_io(b, REQ_OP_WRITE, write_endio);
786                 cond_resched();
787         }
788         blk_finish_plug(&plug);
789 }
790
791 /*
792  * Wait until any activity on the buffer finishes.  Possibly write the
793  * buffer if it is dirty.  When this function finishes, there is no I/O
794  * running on the buffer and the buffer is not dirty.
795  */
796 static void __make_buffer_clean(struct dm_buffer *b)
797 {
798         BUG_ON(b->hold_count);
799
800         if (!b->state)  /* fast case */
801                 return;
802
803         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
804         __write_dirty_buffer(b, NULL);
805         wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
806 }
807
808 /*
809  * Find some buffer that is not held by anybody, clean it, unlink it and
810  * return it.
811  */
812 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
813 {
814         struct dm_buffer *b;
815
816         list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
817                 BUG_ON(test_bit(B_WRITING, &b->state));
818                 BUG_ON(test_bit(B_DIRTY, &b->state));
819
820                 if (!b->hold_count) {
821                         __make_buffer_clean(b);
822                         __unlink_buffer(b);
823                         return b;
824                 }
825                 cond_resched();
826         }
827
828         list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
829                 BUG_ON(test_bit(B_READING, &b->state));
830
831                 if (!b->hold_count) {
832                         __make_buffer_clean(b);
833                         __unlink_buffer(b);
834                         return b;
835                 }
836                 cond_resched();
837         }
838
839         return NULL;
840 }
841
842 /*
843  * Wait until some other thread frees a buffer or releases the hold count
844  * on some buffer.
845  *
846  * This function is entered with c->lock held, drops it and regains it
847  * before exiting.
848  */
849 static void __wait_for_free_buffer(struct dm_bufio_client *c)
850 {
851         DECLARE_WAITQUEUE(wait, current);
852
853         add_wait_queue(&c->free_buffer_wait, &wait);
854         set_current_state(TASK_UNINTERRUPTIBLE);
855         dm_bufio_unlock(c);
856
857         io_schedule();
858
859         remove_wait_queue(&c->free_buffer_wait, &wait);
860
861         dm_bufio_lock(c);
862 }
863
864 enum new_flag {
865         NF_FRESH = 0,
866         NF_READ = 1,
867         NF_GET = 2,
868         NF_PREFETCH = 3
869 };
870
871 /*
872  * Allocate a new buffer. If the allocation is not possible, wait until
873  * some other thread frees a buffer.
874  *
875  * May drop the lock and regain it.
876  */
877 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
878 {
879         struct dm_buffer *b;
880         bool tried_noio_alloc = false;
881
882         /*
883          * dm-bufio is resistant to allocation failures (it just keeps
884          * one buffer reserved in case all the allocations fail).
885          * So set flags to not try too hard:
886          *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
887          *                  mutex and wait ourselves.
888          *      __GFP_NORETRY: don't retry and rather return failure
889          *      __GFP_NOMEMALLOC: don't use emergency reserves
890          *      __GFP_NOWARN: don't print a warning in case of failure
891          *
892          * For debugging, if we set the cache size to 1, no new buffers will
893          * be allocated.
894          */
895         while (1) {
896                 if (dm_bufio_cache_size_latch != 1) {
897                         b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
898                         if (b)
899                                 return b;
900                 }
901
902                 if (nf == NF_PREFETCH)
903                         return NULL;
904
905                 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
906                         dm_bufio_unlock(c);
907                         b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
908                         dm_bufio_lock(c);
909                         if (b)
910                                 return b;
911                         tried_noio_alloc = true;
912                 }
913
914                 if (!list_empty(&c->reserved_buffers)) {
915                         b = list_entry(c->reserved_buffers.next,
916                                        struct dm_buffer, lru_list);
917                         list_del(&b->lru_list);
918                         c->need_reserved_buffers++;
919
920                         return b;
921                 }
922
923                 b = __get_unclaimed_buffer(c);
924                 if (b)
925                         return b;
926
927                 __wait_for_free_buffer(c);
928         }
929 }
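
/*
 * To summarize the fallback chain above (descriptive only): a GFP_NOWAIT
 * allocation is tried first with the lock held; prefetch gives up at that
 * point; otherwise the lock is dropped for one GFP_NOIO attempt, then a
 * buffer is taken from the reserved pool, then an unclaimed buffer is
 * evicted, and finally the caller sleeps until another thread frees one.
 */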
930
931 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
932 {
933         struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
934
935         if (!b)
936                 return NULL;
937
938         if (c->alloc_callback)
939                 c->alloc_callback(b);
940
941         return b;
942 }
943
944 /*
945  * Free a buffer and wake other threads waiting for free buffers.
946  */
947 static void __free_buffer_wake(struct dm_buffer *b)
948 {
949         struct dm_bufio_client *c = b->c;
950
951         if (!c->need_reserved_buffers)
952                 free_buffer(b);
953         else {
954                 list_add(&b->lru_list, &c->reserved_buffers);
955                 c->need_reserved_buffers--;
956         }
957
958         wake_up(&c->free_buffer_wait);
959 }
960
961 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
962                                         struct list_head *write_list)
963 {
964         struct dm_buffer *b, *tmp;
965
966         list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
967                 BUG_ON(test_bit(B_READING, &b->state));
968
969                 if (!test_bit(B_DIRTY, &b->state) &&
970                     !test_bit(B_WRITING, &b->state)) {
971                         __relink_lru(b, LIST_CLEAN);
972                         continue;
973                 }
974
975                 if (no_wait && test_bit(B_WRITING, &b->state))
976                         return;
977
978                 __write_dirty_buffer(b, write_list);
979                 cond_resched();
980         }
981 }
982
983 /*
984  * Check if we're over the writeback watermark.
985  * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times
986  * the number of clean buffers, start writing them back asynchronously.
987  */
988 static void __check_watermark(struct dm_bufio_client *c,
989                               struct list_head *write_list)
990 {
991         if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
992                 __write_dirty_buffers_async(c, 1, write_list);
993 }
994
995 /*----------------------------------------------------------------
996  * Getting a buffer
997  *--------------------------------------------------------------*/
998
999 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1000                                      enum new_flag nf, int *need_submit,
1001                                      struct list_head *write_list)
1002 {
1003         struct dm_buffer *b, *new_b = NULL;
1004
1005         *need_submit = 0;
1006
1007         b = __find(c, block);
1008         if (b)
1009                 goto found_buffer;
1010
1011         if (nf == NF_GET)
1012                 return NULL;
1013
1014         new_b = __alloc_buffer_wait(c, nf);
1015         if (!new_b)
1016                 return NULL;
1017
1018         /*
1019          * We've had a period where the mutex was unlocked, so need to
1020          * recheck the buffer tree.
1021          */
1022         b = __find(c, block);
1023         if (b) {
1024                 __free_buffer_wake(new_b);
1025                 goto found_buffer;
1026         }
1027
1028         __check_watermark(c, write_list);
1029
1030         b = new_b;
1031         b->hold_count = 1;
1032         b->read_error = 0;
1033         b->write_error = 0;
1034         __link_buffer(b, block, LIST_CLEAN);
1035
1036         if (nf == NF_FRESH) {
1037                 b->state = 0;
1038                 return b;
1039         }
1040
1041         b->state = 1 << B_READING;
1042         *need_submit = 1;
1043
1044         return b;
1045
1046 found_buffer:
1047         if (nf == NF_PREFETCH)
1048                 return NULL;
1049         /*
1050          * Note: it is essential that we don't wait for the buffer to be
1051  * read if the dm_bufio_get function is used. Both dm_bufio_get and
1052          * dm_bufio_prefetch can be used in the driver request routine.
1053          * If the user called both dm_bufio_prefetch and dm_bufio_get on
1054          * the same buffer, it would deadlock if we waited.
1055          */
1056         if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1057                 return NULL;
1058
1059         b->hold_count++;
1060         __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1061                      test_bit(B_WRITING, &b->state));
1062         return b;
1063 }
1064
1065 /*
1066  * The endio routine for reading: set the error, clear the bit and wake up
1067  * anyone waiting on the buffer.
1068  */
1069 static void read_endio(struct dm_buffer *b, blk_status_t status)
1070 {
1071         b->read_error = status;
1072
1073         BUG_ON(!test_bit(B_READING, &b->state));
1074
1075         smp_mb__before_atomic();
1076         clear_bit(B_READING, &b->state);
1077         smp_mb__after_atomic();
1078
1079         wake_up_bit(&b->state, B_READING);
1080 }
1081
1082 /*
1083  * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1084  * functions is similar except that dm_bufio_new doesn't read the
1085  * buffer from the disk (assuming that the caller overwrites all the data
1086  * and uses dm_bufio_mark_buffer_dirty to write new data back).
1087  */
1088 static void *new_read(struct dm_bufio_client *c, sector_t block,
1089                       enum new_flag nf, struct dm_buffer **bp)
1090 {
1091         int need_submit;
1092         struct dm_buffer *b;
1093
1094         LIST_HEAD(write_list);
1095
1096         dm_bufio_lock(c);
1097         b = __bufio_new(c, block, nf, &need_submit, &write_list);
1098 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1099         if (b && b->hold_count == 1)
1100                 buffer_record_stack(b);
1101 #endif
1102         dm_bufio_unlock(c);
1103
1104         __flush_write_list(&write_list);
1105
1106         if (!b)
1107                 return NULL;
1108
1109         if (need_submit)
1110                 submit_io(b, REQ_OP_READ, read_endio);
1111
1112         wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1113
1114         if (b->read_error) {
1115                 int error = blk_status_to_errno(b->read_error);
1116
1117                 dm_bufio_release(b);
1118
1119                 return ERR_PTR(error);
1120         }
1121
1122         *bp = b;
1123
1124         return b->data;
1125 }
1126
1127 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1128                    struct dm_buffer **bp)
1129 {
1130         return new_read(c, block, NF_GET, bp);
1131 }
1132 EXPORT_SYMBOL_GPL(dm_bufio_get);
1133
1134 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1135                     struct dm_buffer **bp)
1136 {
1137         BUG_ON(dm_bufio_in_request());
1138
1139         return new_read(c, block, NF_READ, bp);
1140 }
1141 EXPORT_SYMBOL_GPL(dm_bufio_read);
1142
1143 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1144                    struct dm_buffer **bp)
1145 {
1146         BUG_ON(dm_bufio_in_request());
1147
1148         return new_read(c, block, NF_FRESH, bp);
1149 }
1150 EXPORT_SYMBOL_GPL(dm_bufio_new);
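
/*
 * A minimal usage sketch for the read interface above (illustrative only;
 * "client", "block_nr", "buf" and "data" are placeholders, not names from
 * this file):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(client, block_nr, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use up to dm_bufio_get_block_size(client) bytes at data ...
 *	dm_bufio_release(buf);
 *
 * dm_bufio_new() follows the same pattern but skips the read, for callers
 * that will overwrite the whole block anyway.
 */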
1151
1152 void dm_bufio_prefetch(struct dm_bufio_client *c,
1153                        sector_t block, unsigned n_blocks)
1154 {
1155         struct blk_plug plug;
1156
1157         LIST_HEAD(write_list);
1158
1159         BUG_ON(dm_bufio_in_request());
1160
1161         blk_start_plug(&plug);
1162         dm_bufio_lock(c);
1163
1164         for (; n_blocks--; block++) {
1165                 int need_submit;
1166                 struct dm_buffer *b;
1167                 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1168                                 &write_list);
1169                 if (unlikely(!list_empty(&write_list))) {
1170                         dm_bufio_unlock(c);
1171                         blk_finish_plug(&plug);
1172                         __flush_write_list(&write_list);
1173                         blk_start_plug(&plug);
1174                         dm_bufio_lock(c);
1175                 }
1176                 if (unlikely(b != NULL)) {
1177                         dm_bufio_unlock(c);
1178
1179                         if (need_submit)
1180                                 submit_io(b, REQ_OP_READ, read_endio);
1181                         dm_bufio_release(b);
1182
1183                         cond_resched();
1184
1185                         if (!n_blocks)
1186                                 goto flush_plug;
1187                         dm_bufio_lock(c);
1188                 }
1189         }
1190
1191         dm_bufio_unlock(c);
1192
1193 flush_plug:
1194         blk_finish_plug(&plug);
1195 }
1196 EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
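
/*
 * Usage note (illustrative only): a typical caller prefetches a range it
 * is about to walk and then reads the blocks one by one, so the reads
 * mostly find buffers whose I/O is already in flight:
 *
 *	dm_bufio_prefetch(client, first_block, n_blocks);
 *	for (i = 0; i < n_blocks; i++) {
 *		data = dm_bufio_read(client, first_block + i, &buf);
 *		... use data ...
 *		dm_bufio_release(buf);
 *	}
 */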
1197
1198 void dm_bufio_release(struct dm_buffer *b)
1199 {
1200         struct dm_bufio_client *c = b->c;
1201
1202         dm_bufio_lock(c);
1203
1204         BUG_ON(!b->hold_count);
1205
1206         b->hold_count--;
1207         if (!b->hold_count) {
1208                 wake_up(&c->free_buffer_wait);
1209
1210                 /*
1211                  * If there were errors on the buffer, and the buffer is not
1212                  * to be written, free the buffer. There is no point in caching
1213                  * an invalid buffer.
1214                  */
1215                 if ((b->read_error || b->write_error) &&
1216                     !test_bit(B_READING, &b->state) &&
1217                     !test_bit(B_WRITING, &b->state) &&
1218                     !test_bit(B_DIRTY, &b->state)) {
1219                         __unlink_buffer(b);
1220                         __free_buffer_wake(b);
1221                 }
1222         }
1223
1224         dm_bufio_unlock(c);
1225 }
1226 EXPORT_SYMBOL_GPL(dm_bufio_release);
1227
1228 void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1229                                         unsigned start, unsigned end)
1230 {
1231         struct dm_bufio_client *c = b->c;
1232
1233         BUG_ON(start >= end);
1234         BUG_ON(end > b->c->block_size);
1235
1236         dm_bufio_lock(c);
1237
1238         BUG_ON(test_bit(B_READING, &b->state));
1239
1240         if (!test_and_set_bit(B_DIRTY, &b->state)) {
1241                 b->dirty_start = start;
1242                 b->dirty_end = end;
1243                 __relink_lru(b, LIST_DIRTY);
1244         } else {
1245                 if (start < b->dirty_start)
1246                         b->dirty_start = start;
1247                 if (end > b->dirty_end)
1248                         b->dirty_end = end;
1249         }
1250
1251         dm_bufio_unlock(c);
1252 }
1253 EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1254
1255 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1256 {
1257         dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1258 }
1259 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1260
1261 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1262 {
1263         LIST_HEAD(write_list);
1264
1265         BUG_ON(dm_bufio_in_request());
1266
1267         dm_bufio_lock(c);
1268         __write_dirty_buffers_async(c, 0, &write_list);
1269         dm_bufio_unlock(c);
1270         __flush_write_list(&write_list);
1271 }
1272 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1273
1274 /*
1275  * For performance, it is essential that the buffers are written asynchronously
1276  * and simultaneously (so that the block layer can merge the writes) and then
1277  * waited upon.
1278  *
1279  * Finally, we flush hardware disk cache.
1280  */
1281 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1282 {
1283         int a, f;
1284         unsigned long buffers_processed = 0;
1285         struct dm_buffer *b, *tmp;
1286
1287         LIST_HEAD(write_list);
1288
1289         dm_bufio_lock(c);
1290         __write_dirty_buffers_async(c, 0, &write_list);
1291         dm_bufio_unlock(c);
1292         __flush_write_list(&write_list);
1293         dm_bufio_lock(c);
1294
1295 again:
1296         list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1297                 int dropped_lock = 0;
1298
1299                 if (buffers_processed < c->n_buffers[LIST_DIRTY])
1300                         buffers_processed++;
1301
1302                 BUG_ON(test_bit(B_READING, &b->state));
1303
1304                 if (test_bit(B_WRITING, &b->state)) {
1305                         if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1306                                 dropped_lock = 1;
1307                                 b->hold_count++;
1308                                 dm_bufio_unlock(c);
1309                                 wait_on_bit_io(&b->state, B_WRITING,
1310                                                TASK_UNINTERRUPTIBLE);
1311                                 dm_bufio_lock(c);
1312                                 b->hold_count--;
1313                         } else
1314                                 wait_on_bit_io(&b->state, B_WRITING,
1315                                                TASK_UNINTERRUPTIBLE);
1316                 }
1317
1318                 if (!test_bit(B_DIRTY, &b->state) &&
1319                     !test_bit(B_WRITING, &b->state))
1320                         __relink_lru(b, LIST_CLEAN);
1321
1322                 cond_resched();
1323
1324                 /*
1325                  * If we dropped the lock, the list is no longer consistent,
1326                  * so we must restart the search.
1327                  *
1328                  * In the most common case, the buffer just processed is
1329                  * relinked to the clean list, so we won't loop scanning the
1330                  * same buffer again and again.
1331                  *
1332                  * This may livelock if there is another thread simultaneously
1333                  * dirtying buffers, so we count the number of buffers walked
1334                  * and if it exceeds the total number of buffers, it means that
1335                  * someone is doing some writes simultaneously with us.  In
1336                  * this case, stop, dropping the lock.
1337                  */
1338                 if (dropped_lock)
1339                         goto again;
1340         }
1341         wake_up(&c->free_buffer_wait);
1342         dm_bufio_unlock(c);
1343
1344         a = xchg(&c->async_write_error, 0);
1345         f = dm_bufio_issue_flush(c);
1346         if (a)
1347                 return a;
1348
1349         return f;
1350 }
1351 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
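
/*
 * A minimal sketch of the write-back pattern built from the primitives
 * above (illustrative only; "client", "block_nr", "buf" and "data" are
 * placeholders, not names from this file):
 *
 *	data = dm_bufio_read(client, block_nr, &buf);
 *	... modify some bytes of data ...
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 *	r = dm_bufio_write_dirty_buffers(client);
 *
 * dm_bufio_write_dirty_buffers() returns the first asynchronous write
 * error, if any, otherwise the result of the final disk cache flush.
 */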
1352
1353 /*
1354  * Use dm-io to send an empty barrier to flush the device.
1355  */
1356 int dm_bufio_issue_flush(struct dm_bufio_client *c)
1357 {
1358         struct dm_io_request io_req = {
1359                 .bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
1360                 .mem.type = DM_IO_KMEM,
1361                 .mem.ptr.addr = NULL,
1362                 .client = c->dm_io,
1363         };
1364         struct dm_io_region io_reg = {
1365                 .bdev = c->bdev,
1366                 .sector = 0,
1367                 .count = 0,
1368         };
1369
1370         BUG_ON(dm_bufio_in_request());
1371
1372         return dm_io(&io_req, 1, &io_reg, NULL);
1373 }
1374 EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1375
1376 /*
1377  * Use dm-io to send a discard request to flush the device.
1378  */
1379 int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1380 {
1381         struct dm_io_request io_req = {
1382                 .bi_opf = REQ_OP_DISCARD | REQ_SYNC,
1383                 .mem.type = DM_IO_KMEM,
1384                 .mem.ptr.addr = NULL,
1385                 .client = c->dm_io,
1386         };
1387         struct dm_io_region io_reg = {
1388                 .bdev = c->bdev,
1389                 .sector = block_to_sector(c, block),
1390                 .count = block_to_sector(c, count),
1391         };
1392
1393         BUG_ON(dm_bufio_in_request());
1394
1395         return dm_io(&io_req, 1, &io_reg, NULL);
1396 }
1397 EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1398
1399 /*
1400  * We first delete any other buffer that may be at that new location.
1401  *
1402  * Then, we write the buffer to the original location if it was dirty.
1403  *
1404  * Then, if we are the only one who is holding the buffer, relink the buffer
1405  * in the buffer tree for the new location.
1406  *
1407  * If there was someone else holding the buffer, we write it to the new
1408  * location but not relink it, because that other user needs to have the buffer
1409  * at the same place.
1410  */
1411 void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1412 {
1413         struct dm_bufio_client *c = b->c;
1414         struct dm_buffer *new;
1415
1416         BUG_ON(dm_bufio_in_request());
1417
1418         dm_bufio_lock(c);
1419
1420 retry:
1421         new = __find(c, new_block);
1422         if (new) {
1423                 if (new->hold_count) {
1424                         __wait_for_free_buffer(c);
1425                         goto retry;
1426                 }
1427
1428                 /*
1429                  * FIXME: Is there any point waiting for a write that's going
1430                  * to be overwritten in a bit?
1431                  */
1432                 __make_buffer_clean(new);
1433                 __unlink_buffer(new);
1434                 __free_buffer_wake(new);
1435         }
1436
1437         BUG_ON(!b->hold_count);
1438         BUG_ON(test_bit(B_READING, &b->state));
1439
1440         __write_dirty_buffer(b, NULL);
1441         if (b->hold_count == 1) {
1442                 wait_on_bit_io(&b->state, B_WRITING,
1443                                TASK_UNINTERRUPTIBLE);
1444                 set_bit(B_DIRTY, &b->state);
1445                 b->dirty_start = 0;
1446                 b->dirty_end = c->block_size;
1447                 __unlink_buffer(b);
1448                 __link_buffer(b, new_block, LIST_DIRTY);
1449         } else {
1450                 sector_t old_block;
1451                 wait_on_bit_lock_io(&b->state, B_WRITING,
1452                                     TASK_UNINTERRUPTIBLE);
1453                 /*
1454                  * Relink buffer to "new_block" so that write_callback
1455                  * sees "new_block" as a block number.
1456                  * After the write, link the buffer back to old_block.
1457                  * All this must be done in bufio lock, so that block number
1458                  * change isn't visible to other threads.
1459                  */
1460                 old_block = b->block;
1461                 __unlink_buffer(b);
1462                 __link_buffer(b, new_block, b->list_mode);
1463                 submit_io(b, REQ_OP_WRITE, write_endio);
1464                 wait_on_bit_io(&b->state, B_WRITING,
1465                                TASK_UNINTERRUPTIBLE);
1466                 __unlink_buffer(b);
1467                 __link_buffer(b, old_block, b->list_mode);
1468         }
1469
1470         dm_bufio_unlock(c);
1471         dm_bufio_release(b);
1472 }
1473 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1474
1475 static void forget_buffer_locked(struct dm_buffer *b)
1476 {
1477         if (likely(!b->hold_count) && likely(!b->state)) {
1478                 __unlink_buffer(b);
1479                 __free_buffer_wake(b);
1480         }
1481 }
1482
1483 /*
1484  * Free the given buffer.
1485  *
1486  * This is just a hint: if the buffer is in use or dirty, this function
1487  * does nothing.
1488  */
1489 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1490 {
1491         struct dm_buffer *b;
1492
1493         dm_bufio_lock(c);
1494
1495         b = __find(c, block);
1496         if (b)
1497                 forget_buffer_locked(b);
1498
1499         dm_bufio_unlock(c);
1500 }
1501 EXPORT_SYMBOL_GPL(dm_bufio_forget);
1502
1503 void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1504 {
1505         struct dm_buffer *b;
1506         sector_t end_block = block + n_blocks;
1507
1508         while (block < end_block) {
1509                 dm_bufio_lock(c);
1510
1511                 b = __find_next(c, block);
1512                 if (b) {
1513                         block = b->block + 1;
1514                         forget_buffer_locked(b);
1515                 }
1516
1517                 dm_bufio_unlock(c);
1518
1519                 if (!b)
1520                         break;
1521         }
1522
1523 }
1524 EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1525
1526 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1527 {
1528         c->minimum_buffers = n;
1529 }
1530 EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1531
1532 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1533 {
1534         return c->block_size;
1535 }
1536 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1537
1538 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1539 {
1540         sector_t s = bdev_nr_sectors(c->bdev);
1541         if (s >= c->start)
1542                 s -= c->start;
1543         else
1544                 s = 0;
1545         if (likely(c->sectors_per_block_bits >= 0))
1546                 s >>= c->sectors_per_block_bits;
1547         else
1548                 sector_div(s, c->block_size >> SECTOR_SHIFT);
1549         return s;
1550 }
1551 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1552
1553 struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
1554 {
1555         return c->dm_io;
1556 }
1557 EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
1558
1559 sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1560 {
1561         return b->block;
1562 }
1563 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1564
1565 void *dm_bufio_get_block_data(struct dm_buffer *b)
1566 {
1567         return b->data;
1568 }
1569 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1570
1571 void *dm_bufio_get_aux_data(struct dm_buffer *b)
1572 {
1573         return b + 1;
1574 }
1575 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1576
1577 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1578 {
1579         return b->c;
1580 }
1581 EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1582
1583 static void drop_buffers(struct dm_bufio_client *c)
1584 {
1585         struct dm_buffer *b;
1586         int i;
1587         bool warned = false;
1588
1589         BUG_ON(dm_bufio_in_request());
1590
1591         /*
1592          * An optimization so that the buffers are not written one-by-one.
1593          */
1594         dm_bufio_write_dirty_buffers_async(c);
1595
1596         dm_bufio_lock(c);
1597
1598         while ((b = __get_unclaimed_buffer(c)))
1599                 __free_buffer_wake(b);
1600
1601         for (i = 0; i < LIST_SIZE; i++)
1602                 list_for_each_entry(b, &c->lru[i], lru_list) {
1603                         WARN_ON(!warned);
1604                         warned = true;
1605                         DMERR("leaked buffer %llx, hold count %u, list %d",
1606                               (unsigned long long)b->block, b->hold_count, i);
1607 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1608                         stack_trace_print(b->stack_entries, b->stack_len, 1);
1609                         /* mark unclaimed to avoid BUG_ON below */
1610                         b->hold_count = 0;
1611 #endif
1612                 }
1613
1614 #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1615         while ((b = __get_unclaimed_buffer(c)))
1616                 __free_buffer_wake(b);
1617 #endif
1618
1619         for (i = 0; i < LIST_SIZE; i++)
1620                 BUG_ON(!list_empty(&c->lru[i]));
1621
1622         dm_bufio_unlock(c);
1623 }
1624
1625 /*
1626  * We may not be able to evict this buffer if I/O is pending or the client
1627  * is still using it.  The caller is expected to know the buffer is too old.
1628  *
1629  * And if GFP_NOFS is used, we must not do any I/O because we hold
1630  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
1631  * rerouted to a different bufio client.
1632  */
1633 static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1634 {
1635         if (!(gfp & __GFP_FS)) {
1636                 if (test_bit(B_READING, &b->state) ||
1637                     test_bit(B_WRITING, &b->state) ||
1638                     test_bit(B_DIRTY, &b->state))
1639                         return false;
1640         }
1641
1642         if (b->hold_count)
1643                 return false;
1644
1645         __make_buffer_clean(b);
1646         __unlink_buffer(b);
1647         __free_buffer_wake(b);
1648
1649         return true;
1650 }
1651
1652 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1653 {
1654         unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1655         if (likely(c->sectors_per_block_bits >= 0))
1656                 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1657         else
1658                 retain_bytes /= c->block_size;
1659         return retain_bytes;
1660 }
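
/*
 * Worked example (illustrative numbers): with the default
 * DM_BUFIO_DEFAULT_RETAIN_BYTES of 256 KiB and 4 KiB blocks
 * (sectors_per_block_bits = 3), the retain target is
 * 262144 >> (3 + 9) = 64 buffers per client.
 */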
1661
1662 static void __scan(struct dm_bufio_client *c)
1663 {
1664         int l;
1665         struct dm_buffer *b, *tmp;
1666         unsigned long freed = 0;
1667         unsigned long count = c->n_buffers[LIST_CLEAN] +
1668                               c->n_buffers[LIST_DIRTY];
1669         unsigned long retain_target = get_retain_buffers(c);
1670
1671         for (l = 0; l < LIST_SIZE; l++) {
1672                 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1673                         if (count - freed <= retain_target)
1674                                 atomic_long_set(&c->need_shrink, 0);
1675                         if (!atomic_long_read(&c->need_shrink))
1676                                 return;
1677                         if (__try_evict_buffer(b, GFP_KERNEL)) {
1678                                 atomic_long_dec(&c->need_shrink);
1679                                 freed++;
1680                         }
1681                         cond_resched();
1682                 }
1683         }
1684 }
1685
1686 static void shrink_work(struct work_struct *w)
1687 {
1688         struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
1689
1690         dm_bufio_lock(c);
1691         __scan(c);
1692         dm_bufio_unlock(c);
1693 }
1694
1695 static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1696 {
1697         struct dm_bufio_client *c;
1698
1699         c = container_of(shrink, struct dm_bufio_client, shrinker);
1700         atomic_long_add(sc->nr_to_scan, &c->need_shrink);
1701         queue_work(dm_bufio_wq, &c->shrink_work);
1702
1703         return sc->nr_to_scan;
1704 }
1705
1706 static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1707 {
1708         struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1709         unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1710                               READ_ONCE(c->n_buffers[LIST_DIRTY]);
1711         unsigned long retain_target = get_retain_buffers(c);
1712         unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
1713
1714         if (unlikely(count < retain_target))
1715                 count = 0;
1716         else
1717                 count -= retain_target;
1718
1719         if (unlikely(count < queued_for_cleanup))
1720                 count = 0;
1721         else
1722                 count -= queued_for_cleanup;
1723
1724         return count;
1725 }
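
/*
 * For example, if 200 buffers are cached, the retain target is 64 and 50
 * buffers are already queued for cleanup, the shrinker reports
 * 200 - 64 - 50 = 86 freeable objects; each subtraction saturates at zero
 * instead of underflowing.
 */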
1726
1727 /*
1728  * Create the buffering interface
1729  */
1730 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1731                                                unsigned reserved_buffers, unsigned aux_size,
1732                                                void (*alloc_callback)(struct dm_buffer *),
1733                                                void (*write_callback)(struct dm_buffer *),
1734                                                unsigned int flags)
1735 {
1736         int r;
1737         struct dm_bufio_client *c;
1738         unsigned i;
1739         char slab_name[27];
1740
1741         if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1742                 DMERR("%s: block size not specified or is not a multiple of 512 bytes", __func__);
1743                 r = -EINVAL;
1744                 goto bad_client;
1745         }
1746
1747         c = kzalloc(sizeof(*c), GFP_KERNEL);
1748         if (!c) {
1749                 r = -ENOMEM;
1750                 goto bad_client;
1751         }
1752         c->buffer_tree = RB_ROOT;
1753
1754         c->bdev = bdev;
1755         c->block_size = block_size;
1756         if (is_power_of_2(block_size))
1757                 c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1758         else
1759                 c->sectors_per_block_bits = -1;
1760
1761         c->alloc_callback = alloc_callback;
1762         c->write_callback = write_callback;
1763
1764         if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
1765                 c->no_sleep = true;
1766                 static_branch_inc(&no_sleep_enabled);
1767         }
1768
1769         for (i = 0; i < LIST_SIZE; i++) {
1770                 INIT_LIST_HEAD(&c->lru[i]);
1771                 c->n_buffers[i] = 0;
1772         }
1773
1774         mutex_init(&c->lock);
1775         spin_lock_init(&c->spinlock);
1776         INIT_LIST_HEAD(&c->reserved_buffers);
1777         c->need_reserved_buffers = reserved_buffers;
1778
1779         dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1780
1781         init_waitqueue_head(&c->free_buffer_wait);
1782         c->async_write_error = 0;
1783
1784         c->dm_io = dm_io_client_create();
1785         if (IS_ERR(c->dm_io)) {
1786                 r = PTR_ERR(c->dm_io);
1787                 goto bad_dm_io;
1788         }
1789
1790         if (block_size <= KMALLOC_MAX_SIZE &&
1791             (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1792                 unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1793                 snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1794                 c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1795                                                   SLAB_RECLAIM_ACCOUNT, NULL);
1796                 if (!c->slab_cache) {
1797                         r = -ENOMEM;
1798                         goto bad;
1799                 }
1800         }
1801         if (aux_size)
1802                 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1803         else
1804                 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1805         c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1806                                            0, SLAB_RECLAIM_ACCOUNT, NULL);
1807         if (!c->slab_buffer) {
1808                 r = -ENOMEM;
1809                 goto bad;
1810         }
1811
1812         while (c->need_reserved_buffers) {
1813                 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1814
1815                 if (!b) {
1816                         r = -ENOMEM;
1817                         goto bad;
1818                 }
1819                 __free_buffer_wake(b);
1820         }
1821
1822         INIT_WORK(&c->shrink_work, shrink_work);
1823         atomic_long_set(&c->need_shrink, 0);
1824
1825         c->shrinker.count_objects = dm_bufio_shrink_count;
1826         c->shrinker.scan_objects = dm_bufio_shrink_scan;
1827         c->shrinker.seeks = 1;
1828         c->shrinker.batch = 0;
1829         r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
1830                               MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
1831         if (r)
1832                 goto bad;
1833
1834         mutex_lock(&dm_bufio_clients_lock);
1835         dm_bufio_client_count++;
1836         list_add(&c->client_list, &dm_bufio_all_clients);
1837         __cache_size_refresh();
1838         mutex_unlock(&dm_bufio_clients_lock);
1839
1840         return c;
1841
1842 bad:
1843         while (!list_empty(&c->reserved_buffers)) {
1844                 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1845                                                  struct dm_buffer, lru_list);
1846                 list_del(&b->lru_list);
1847                 free_buffer(b);
1848         }
1849         kmem_cache_destroy(c->slab_cache);
1850         kmem_cache_destroy(c->slab_buffer);
1851         dm_io_client_destroy(c->dm_io);
1852 bad_dm_io:
1853         mutex_destroy(&c->lock);
1854         kfree(c);
1855 bad_client:
1856         return ERR_PTR(r);
1857 }
1858 EXPORT_SYMBOL_GPL(dm_bufio_client_create);
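
/*
 * Minimal usage sketch (illustrative only; 'bdev', 'block_nr' and the
 * surrounding target code are assumed, not taken from this file):
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL, 0);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	data = dm_bufio_read(c, block_nr, &buf);  - reads the block, takes a hold
 *	if (!IS_ERR(data)) {
 *		... use the 4096 bytes at 'data' ...
 *		dm_bufio_release(buf);            - drops the hold
 *	}
 *
 *	dm_bufio_client_destroy(c);               - all holds must be released first
 */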
1859
1860 /*
1861  * Free the buffering interface.
1862  * It is required that there are no outstanding references to any buffers.
1863  */
1864 void dm_bufio_client_destroy(struct dm_bufio_client *c)
1865 {
1866         unsigned i;
1867
1868         drop_buffers(c);
1869
1870         unregister_shrinker(&c->shrinker);
1871         flush_work(&c->shrink_work);
1872
1873         mutex_lock(&dm_bufio_clients_lock);
1874
1875         list_del(&c->client_list);
1876         dm_bufio_client_count--;
1877         __cache_size_refresh();
1878
1879         mutex_unlock(&dm_bufio_clients_lock);
1880
1881         BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1882         BUG_ON(c->need_reserved_buffers);
1883
1884         while (!list_empty(&c->reserved_buffers)) {
1885                 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1886                                                  struct dm_buffer, lru_list);
1887                 list_del(&b->lru_list);
1888                 free_buffer(b);
1889         }
1890
1891         for (i = 0; i < LIST_SIZE; i++)
1892                 if (c->n_buffers[i])
1893                         DMERR("leaked buffer count %d: %lu", i, c->n_buffers[i]);
1894
1895         for (i = 0; i < LIST_SIZE; i++)
1896                 BUG_ON(c->n_buffers[i]);
1897
1898         kmem_cache_destroy(c->slab_cache);
1899         kmem_cache_destroy(c->slab_buffer);
1900         dm_io_client_destroy(c->dm_io);
1901         mutex_destroy(&c->lock);
1902         if (c->no_sleep)
1903                 static_branch_dec(&no_sleep_enabled);
1904         kfree(c);
1905 }
1906 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1907
1908 void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1909 {
1910         c->start = start;
1911 }
1912 EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1913
1914 static unsigned get_max_age_hz(void)
1915 {
1916         unsigned max_age = READ_ONCE(dm_bufio_max_age);
1917
1918         if (max_age > UINT_MAX / HZ)
1919                 max_age = UINT_MAX / HZ;
1920
1921         return max_age * HZ;
1922 }
1923
1924 static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1925 {
1926         return time_after_eq(jiffies, b->last_accessed + age_hz);
1927 }
1928
1929 static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1930 {
1931         struct dm_buffer *b, *tmp;
1932         unsigned long retain_target = get_retain_buffers(c);
1933         unsigned long count;
1934         LIST_HEAD(write_list);
1935
1936         dm_bufio_lock(c);
1937
1938         __check_watermark(c, &write_list);
1939         if (unlikely(!list_empty(&write_list))) {
1940                 dm_bufio_unlock(c);
1941                 __flush_write_list(&write_list);
1942                 dm_bufio_lock(c);
1943         }
1944
1945         count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
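        /*
         * Walk the clean LRU from the tail (least recently used) towards the
         * head and stop once the retain target is reached or a buffer younger
         * than age_hz is seen - every buffer closer to the head was accessed
         * more recently.
         */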
1946         list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1947                 if (count <= retain_target)
1948                         break;
1949
1950                 if (!older_than(b, age_hz))
1951                         break;
1952
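                /*
                 * A gfp mask of 0 has __GFP_FS clear, so __try_evict_buffer
                 * skips buffers that are dirty or have I/O in flight; this
                 * path never issues I/O itself.
                 */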
1953                 if (__try_evict_buffer(b, 0))
1954                         count--;
1955
1956                 cond_resched();
1957         }
1958
1959         dm_bufio_unlock(c);
1960 }
1961
1962 static void do_global_cleanup(struct work_struct *w)
1963 {
1964         struct dm_bufio_client *locked_client = NULL;
1965         struct dm_bufio_client *current_client;
1966         struct dm_buffer *b;
1967         unsigned spinlock_hold_count;
1968         unsigned long threshold = dm_bufio_cache_size -
1969                 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1970         unsigned long loops = global_num * 2;
1971
1972         mutex_lock(&dm_bufio_clients_lock);
1973
1974         while (1) {
1975                 cond_resched();
1976
1977                 spin_lock(&global_spinlock);
1978                 if (unlikely(dm_bufio_current_allocated <= threshold))
1979                         break;
1980
1981                 spinlock_hold_count = 0;
1982 get_next:
1983                 if (!loops--)
1984                         break;
1985                 if (unlikely(list_empty(&global_queue)))
1986                         break;
1987                 b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1988
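                /*
                 * Second-chance eviction: a buffer accessed since the last
                 * pass has its accessed bit cleared and is moved back to the
                 * head of the global queue instead of being evicted.  After
                 * 16 such moves in a row the spinlock is dropped briefly to
                 * bound the time it is held.
                 */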
1989                 if (b->accessed) {
1990                         b->accessed = 0;
1991                         list_move(&b->global_list, &global_queue);
1992                         if (likely(++spinlock_hold_count < 16))
1993                                 goto get_next;
1994                         spin_unlock(&global_spinlock);
1995                         continue;
1996                 }
1997
1998                 current_client = b->c;
1999                 if (unlikely(current_client != locked_client)) {
2000                         if (locked_client)
2001                                 dm_bufio_unlock(locked_client);
2002
2003                         if (!dm_bufio_trylock(current_client)) {
2004                                 spin_unlock(&global_spinlock);
2005                                 dm_bufio_lock(current_client);
2006                                 locked_client = current_client;
2007                                 continue;
2008                         }
2009
2010                         locked_client = current_client;
2011                 }
2012
2013                 spin_unlock(&global_spinlock);
2014
2015                 if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
2016                         spin_lock(&global_spinlock);
2017                         list_move(&b->global_list, &global_queue);
2018                         spin_unlock(&global_spinlock);
2019                 }
2020         }
2021
2022         spin_unlock(&global_spinlock);
2023
2024         if (locked_client)
2025                 dm_bufio_unlock(locked_client);
2026
2027         mutex_unlock(&dm_bufio_clients_lock);
2028 }
2029
2030 static void cleanup_old_buffers(void)
2031 {
2032         unsigned long max_age_hz = get_max_age_hz();
2033         struct dm_bufio_client *c;
2034
2035         mutex_lock(&dm_bufio_clients_lock);
2036
2037         __cache_size_refresh();
2038
2039         list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2040                 __evict_old_buffers(c, max_age_hz);
2041
2042         mutex_unlock(&dm_bufio_clients_lock);
2043 }
2044
2045 static void work_fn(struct work_struct *w)
2046 {
2047         cleanup_old_buffers();
2048
2049         queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2050                            DM_BUFIO_WORK_TIMER_SECS * HZ);
2051 }
2052
2053 /*----------------------------------------------------------------
2054  * Module setup
2055  *--------------------------------------------------------------*/
2056
2057 /*
2058  * This is called only once for the whole dm_bufio module.
2059  * It initializes the memory limit.
2060  */
2061 static int __init dm_bufio_init(void)
2062 {
2063         __u64 mem;
2064
2065         dm_bufio_allocated_kmem_cache = 0;
2066         dm_bufio_allocated_get_free_pages = 0;
2067         dm_bufio_allocated_vmalloc = 0;
2068         dm_bufio_current_allocated = 0;
2069
2070         mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2071                                DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2072
2073         if (mem > ULONG_MAX)
2074                 mem = ULONG_MAX;
2075
2076 #ifdef CONFIG_MMU
2077         if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2078                 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2079 #endif
2080
2081         dm_bufio_default_cache_size = mem;
2082
2083         mutex_lock(&dm_bufio_clients_lock);
2084         __cache_size_refresh();
2085         mutex_unlock(&dm_bufio_clients_lock);
2086
2087         dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2088         if (!dm_bufio_wq)
2089                 return -ENOMEM;
2090
2091         INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2092         INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2093         queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2094                            DM_BUFIO_WORK_TIMER_SECS * HZ);
2095
2096         return 0;
2097 }
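
/*
 * For example, on a 64-bit machine with 16 GiB of RAM (no highmem), the
 * default limit comes to 2% of memory, i.e. about 328 MiB; on 32-bit systems
 * the 25%-of-vmalloc-space cap is usually the smaller of the two and applies
 * instead.
 */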
2098
2099 /*
2100  * This is called once when unloading the dm_bufio module.
2101  */
2102 static void __exit dm_bufio_exit(void)
2103 {
2104         int bug = 0;
2105
2106         cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2107         destroy_workqueue(dm_bufio_wq);
2108
2109         if (dm_bufio_client_count) {
2110                 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2111                         __func__, dm_bufio_client_count);
2112                 bug = 1;
2113         }
2114
2115         if (dm_bufio_current_allocated) {
2116                 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2117                         __func__, dm_bufio_current_allocated);
2118                 bug = 1;
2119         }
2120
2121         if (dm_bufio_allocated_get_free_pages) {
2122                 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2123                        __func__, dm_bufio_allocated_get_free_pages);
2124                 bug = 1;
2125         }
2126
2127         if (dm_bufio_allocated_vmalloc) {
2128                 DMCRIT("%s: dm_bufio_allocated_vmalloc leaked: %lu",
2129                        __func__, dm_bufio_allocated_vmalloc);
2130                 bug = 1;
2131         }
2132
2133         BUG_ON(bug);
2134 }
2135
2136 module_init(dm_bufio_init)
2137 module_exit(dm_bufio_exit)
2138
2139 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2140 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2141
2142 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2143 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2144
2145 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2146 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2147
2148 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2149 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2150
2151 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2152 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2153
2154 module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2155 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2156
2157 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2158 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2159
2160 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2161 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2162
2163 MODULE_AUTHOR("Mikulas Patocka <[email protected]>");
2164 MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2165 MODULE_LICENSE("GPL");