drivers/md/dm-integrity.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
4  * Copyright (C) 2016-2017 Milan Broz
5  * Copyright (C) 2016-2017 Mikulas Patocka
6  *
7  * This file is released under the GPL.
8  */
9
10 #include "dm-bio-record.h"
11
12 #include <linux/compiler.h>
13 #include <linux/module.h>
14 #include <linux/device-mapper.h>
15 #include <linux/dm-io.h>
16 #include <linux/vmalloc.h>
17 #include <linux/sort.h>
18 #include <linux/rbtree.h>
19 #include <linux/delay.h>
20 #include <linux/random.h>
21 #include <linux/reboot.h>
22 #include <crypto/hash.h>
23 #include <crypto/skcipher.h>
24 #include <linux/async_tx.h>
25 #include <linux/dm-bufio.h>
26
27 #include "dm-audit.h"
28
29 #define DM_MSG_PREFIX "integrity"
30
31 #define DEFAULT_INTERLEAVE_SECTORS      32768
32 #define DEFAULT_JOURNAL_SIZE_FACTOR     7
33 #define DEFAULT_SECTORS_PER_BITMAP_BIT  32768
34 #define DEFAULT_BUFFER_SECTORS          128
35 #define DEFAULT_JOURNAL_WATERMARK       50
36 #define DEFAULT_SYNC_MSEC               10000
37 #define DEFAULT_MAX_JOURNAL_SECTORS     (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192)
38 #define MIN_LOG2_INTERLEAVE_SECTORS     3
39 #define MAX_LOG2_INTERLEAVE_SECTORS     31
40 #define METADATA_WORKQUEUE_MAX_ACTIVE   16
41 #define RECALC_SECTORS                  (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048)
42 #define RECALC_WRITE_SUPER              16
43 #define BITMAP_BLOCK_SIZE               4096    /* don't change it */
44 #define BITMAP_FLUSH_INTERVAL           (10 * HZ)
45 #define DISCARD_FILLER                  0xf6
46 #define SALT_SIZE                       16
47 #define RECHECK_POOL_SIZE               256
48
49 /*
50  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
51  * so it should not be enabled in the official kernel
52  */
53 //#define DEBUG_PRINT
54 //#define INTERNAL_VERIFY
55
56 /*
57  * On disk structures
58  */
59
60 #define SB_MAGIC                        "integrt"
61 #define SB_VERSION_1                    1
62 #define SB_VERSION_2                    2
63 #define SB_VERSION_3                    3
64 #define SB_VERSION_4                    4
65 #define SB_VERSION_5                    5
66 #define SB_VERSION_6                    6
67 #define SB_SECTORS                      8
68 #define MAX_SECTORS_PER_BLOCK           8
69
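/*
 * On-disk superblock; it is read and written with sync_rw_sb() and lives in
 * the first SB_SECTORS sectors of the metadata area.  Multi-byte fields are
 * little-endian; "version" selects which SB_VERSION_* format is in use.
 */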
70 struct superblock {
71         __u8 magic[8];
72         __u8 version;
73         __u8 log2_interleave_sectors;
74         __le16 integrity_tag_size;
75         __le32 journal_sections;
76         __le64 provided_data_sectors;   /* userspace uses this value */
77         __le32 flags;
78         __u8 log2_sectors_per_block;
79         __u8 log2_blocks_per_bitmap_bit;
80         __u8 pad[2];
81         __le64 recalc_sector;
82         __u8 pad2[8];
83         __u8 salt[SALT_SIZE];
84 };
85
86 #define SB_FLAG_HAVE_JOURNAL_MAC        0x1
87 #define SB_FLAG_RECALCULATING           0x2
88 #define SB_FLAG_DIRTY_BITMAP            0x4
89 #define SB_FLAG_FIXED_PADDING           0x8
90 #define SB_FLAG_FIXED_HMAC              0x10
91 #define SB_FLAG_INLINE                  0x20
92
93 #define JOURNAL_ENTRY_ROUNDUP           8
94
95 typedef __le64 commit_id_t;
96 #define JOURNAL_MAC_PER_SECTOR          8
97
98 struct journal_entry {
99         union {
100                 struct {
101                         __le32 sector_lo;
102                         __le32 sector_hi;
103                 } s;
104                 __le64 sector;
105         } u;
106         commit_id_t last_bytes[];
107         /* __u8 tag[0]; */
108 };
109
110 #define journal_entry_tag(ic, je)               ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
111
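/*
 * The high half of an entry's sector number doubles as a state marker:
 * -1 means the entry is unused, -2 means a write to it is in progress.
 * journal_entry_set_sector() publishes the low half (and any previously
 * written data) before the high half, using smp_wmb()/WRITE_ONCE(), so an
 * entry only looks valid once it has been fully written.
 */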
112 #if BITS_PER_LONG == 64
113 #define journal_entry_set_sector(je, x)         do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
114 #else
115 #define journal_entry_set_sector(je, x)         do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
116 #endif
117 #define journal_entry_get_sector(je)            le64_to_cpu((je)->u.sector)
118 #define journal_entry_is_unused(je)             ((je)->u.s.sector_hi == cpu_to_le32(-1))
119 #define journal_entry_set_unused(je)            ((je)->u.s.sector_hi = cpu_to_le32(-1))
120 #define journal_entry_is_inprogress(je)         ((je)->u.s.sector_hi == cpu_to_le32(-2))
121 #define journal_entry_set_inprogress(je)        ((je)->u.s.sector_hi = cpu_to_le32(-2))
122
123 #define JOURNAL_BLOCK_SECTORS           8
124 #define JOURNAL_SECTOR_DATA             ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
125 #define JOURNAL_MAC_SIZE                (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
126
127 struct journal_sector {
128         struct_group(sectors,
129                 __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
130                 __u8 mac[JOURNAL_MAC_PER_SECTOR];
131         );
132         commit_id_t commit_id;
133 };
134
135 #define MAX_TAG_SIZE                    (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
136
137 #define METADATA_PADDING_SECTORS        8
138
139 #define N_COMMIT_IDS                    4
140
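/*
 * Every journal sector ends in a commit_id_t.  The ids cycle through
 * N_COMMIT_IDS values and journal replay uses them to determine which
 * sections were committed last; these helpers wrap the sequence number.
 */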
141 static unsigned char prev_commit_seq(unsigned char seq)
142 {
143         return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
144 }
145
146 static unsigned char next_commit_seq(unsigned char seq)
147 {
148         return (seq + 1) % N_COMMIT_IDS;
149 }
150
151 /*
152  * In-memory structures
153  */
154
155 struct journal_node {
156         struct rb_node node;
157         sector_t sector;
158 };
159
160 struct alg_spec {
161         char *alg_string;
162         char *key_string;
163         __u8 *key;
164         unsigned int key_size;
165 };
166
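/* Per-target state; referred to as "ic" throughout this file. */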
167 struct dm_integrity_c {
168         struct dm_dev *dev;
169         struct dm_dev *meta_dev;
170         unsigned int tag_size;
171         __s8 log2_tag_size;
172         unsigned int tuple_size;
173         sector_t start;
174         mempool_t journal_io_mempool;
175         struct dm_io_client *io;
176         struct dm_bufio_client *bufio;
177         struct workqueue_struct *metadata_wq;
178         struct superblock *sb;
179         unsigned int journal_pages;
180         unsigned int n_bitmap_blocks;
181
182         struct page_list *journal;
183         struct page_list *journal_io;
184         struct page_list *journal_xor;
185         struct page_list *recalc_bitmap;
186         struct page_list *may_write_bitmap;
187         struct bitmap_block_status *bbs;
188         unsigned int bitmap_flush_interval;
189         int synchronous_mode;
190         struct bio_list synchronous_bios;
191         struct delayed_work bitmap_flush_work;
192
193         struct crypto_skcipher *journal_crypt;
194         struct scatterlist **journal_scatterlist;
195         struct scatterlist **journal_io_scatterlist;
196         struct skcipher_request **sk_requests;
197
198         struct crypto_shash *journal_mac;
199
200         struct journal_node *journal_tree;
201         struct rb_root journal_tree_root;
202
203         sector_t provided_data_sectors;
204
205         unsigned short journal_entry_size;
206         unsigned char journal_entries_per_sector;
207         unsigned char journal_section_entries;
208         unsigned short journal_section_sectors;
209         unsigned int journal_sections;
210         unsigned int journal_entries;
211         sector_t data_device_sectors;
212         sector_t meta_device_sectors;
213         unsigned int initial_sectors;
214         unsigned int metadata_run;
215         __s8 log2_metadata_run;
216         __u8 log2_buffer_sectors;
217         __u8 sectors_per_block;
218         __u8 log2_blocks_per_bitmap_bit;
219
220         unsigned char mode;
221
222         int failed;
223
224         struct crypto_shash *internal_hash;
225
226         struct dm_target *ti;
227
228         /* these variables are locked with endio_wait.lock */
229         struct rb_root in_progress;
230         struct list_head wait_list;
231         wait_queue_head_t endio_wait;
232         struct workqueue_struct *wait_wq;
233         struct workqueue_struct *offload_wq;
234
235         unsigned char commit_seq;
236         commit_id_t commit_ids[N_COMMIT_IDS];
237
238         unsigned int committed_section;
239         unsigned int n_committed_sections;
240
241         unsigned int uncommitted_section;
242         unsigned int n_uncommitted_sections;
243
244         unsigned int free_section;
245         unsigned char free_section_entry;
246         unsigned int free_sectors;
247
248         unsigned int free_sectors_threshold;
249
250         struct workqueue_struct *commit_wq;
251         struct work_struct commit_work;
252
253         struct workqueue_struct *writer_wq;
254         struct work_struct writer_work;
255
256         struct workqueue_struct *recalc_wq;
257         struct work_struct recalc_work;
258
259         struct bio_list flush_bio_list;
260
261         unsigned long autocommit_jiffies;
262         struct timer_list autocommit_timer;
263         unsigned int autocommit_msec;
264
265         wait_queue_head_t copy_to_journal_wait;
266
267         struct completion crypto_backoff;
268
269         bool wrote_to_journal;
270         bool journal_uptodate;
271         bool just_formatted;
272         bool recalculate_flag;
273         bool reset_recalculate_flag;
274         bool discard;
275         bool fix_padding;
276         bool fix_hmac;
277         bool legacy_recalculate;
278
279         struct alg_spec internal_hash_alg;
280         struct alg_spec journal_crypt_alg;
281         struct alg_spec journal_mac_alg;
282
283         atomic64_t number_of_mismatches;
284
285         mempool_t recheck_pool;
286         struct bio_set recheck_bios;
287         struct bio_set recalc_bios;
288
289         struct notifier_block reboot_notifier;
290 };
291
292 struct dm_integrity_range {
293         sector_t logical_sector;
294         sector_t n_sectors;
295         bool waiting;
296         union {
297                 struct rb_node node;
298                 struct {
299                         struct task_struct *task;
300                         struct list_head wait_entry;
301                 };
302         };
303 };
304
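/*
 * Per-bio state: the locked range, the current metadata position and the
 * completion bookkeeping for one request in flight.
 */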
305 struct dm_integrity_io {
306         struct work_struct work;
307
308         struct dm_integrity_c *ic;
309         enum req_op op;
310         bool fua;
311
312         struct dm_integrity_range range;
313
314         sector_t metadata_block;
315         unsigned int metadata_offset;
316
317         atomic_t in_flight;
318         blk_status_t bi_status;
319
320         struct completion *completion;
321
322         struct dm_bio_details bio_details;
323
324         char *integrity_payload;
325         unsigned payload_len;
326         bool integrity_payload_from_mempool;
327         bool integrity_range_locked;
328 };
329
330 struct journal_completion {
331         struct dm_integrity_c *ic;
332         atomic_t in_flight;
333         struct completion comp;
334 };
335
336 struct journal_io {
337         struct dm_integrity_range range;
338         struct journal_completion *comp;
339 };
340
341 struct bitmap_block_status {
342         struct work_struct work;
343         struct dm_integrity_c *ic;
344         unsigned int idx;
345         unsigned long *bitmap;
346         struct bio_list bio_queue;
347         spinlock_t bio_queue_lock;
348
349 };
350
351 static struct kmem_cache *journal_io_cache;
352
353 #define JOURNAL_IO_MEMPOOL      32
354
355 #ifdef DEBUG_PRINT
356 #define DEBUG_print(x, ...)                     printk(KERN_DEBUG x, ##__VA_ARGS__)
357 #define DEBUG_bytes(bytes, len, msg, ...)       printk(KERN_DEBUG msg "%s%*ph\n", ##__VA_ARGS__, \
358                                                        len ? ": " : "", len, bytes)
359 #else
360 #define DEBUG_print(x, ...)                     do { } while (0)
361 #define DEBUG_bytes(bytes, len, msg, ...)       do { } while (0)
362 #endif
363
364 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
365 static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map);
366 static void integrity_bio_wait(struct work_struct *w);
367 static void dm_integrity_dtr(struct dm_target *ti);
368
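/*
 * Record only the first error; ic->failed is sticky and is checked with
 * dm_integrity_failed() before further metadata operations are attempted.
 */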
369 static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
370 {
371         if (err == -EILSEQ)
372                 atomic64_inc(&ic->number_of_mismatches);
373         if (!cmpxchg(&ic->failed, 0, err))
374                 DMERR("Error on %s: %d", msg, err);
375 }
376
377 static int dm_integrity_failed(struct dm_integrity_c *ic)
378 {
379         return READ_ONCE(ic->failed);
380 }
381
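/*
 * Without the fixed-HMAC superblock format, automatic recalculation is
 * refused whenever any key is in use; with it, only when the internal hash
 * is keyed but the journal MAC is not.  "legacy_recalculate" overrides
 * this check.
 */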
382 static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic)
383 {
384         if (ic->legacy_recalculate)
385                 return false;
386         if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ?
387             ic->internal_hash_alg.key || ic->journal_mac_alg.key :
388             ic->internal_hash_alg.key && !ic->journal_mac_alg.key)
389                 return true;
390         return false;
391 }
392
393 static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned int i,
394                                           unsigned int j, unsigned char seq)
395 {
396         /*
397          * Xor the number with the section and sector, so that if a piece of
398          * the journal is written to the wrong place, it is detected.
399          */
400         return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
401 }
402
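/*
 * Interleaved layout (no separate metadata device): the device is split into
 * "areas" of 2^log2_interleave_sectors data sectors, each preceded by a run
 * of metadata_run (or 2^log2_metadata_run) sectors holding that area's tags.
 * get_area_and_offset() splits a data sector into (area, offset);
 * get_data_sector() maps the pair back to a device sector by adding ic->start,
 * the initial sectors and all metadata runs up to and including this area.
 * With a separate metadata device there is only a single area.
 */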
403 static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
404                                 sector_t *area, sector_t *offset)
405 {
406         if (!ic->meta_dev) {
407                 __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
408                 *area = data_sector >> log2_interleave_sectors;
409                 *offset = (unsigned int)data_sector & ((1U << log2_interleave_sectors) - 1);
410         } else {
411                 *area = 0;
412                 *offset = data_sector;
413         }
414 }
415
416 #define sector_to_block(ic, n)                                          \
417 do {                                                                    \
418         BUG_ON((n) & (unsigned int)((ic)->sectors_per_block - 1));              \
419         (n) >>= (ic)->sb->log2_sectors_per_block;                       \
420 } while (0)
421
422 static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
423                                             sector_t offset, unsigned int *metadata_offset)
424 {
425         __u64 ms;
426         unsigned int mo;
427
428         ms = area << ic->sb->log2_interleave_sectors;
429         if (likely(ic->log2_metadata_run >= 0))
430                 ms += area << ic->log2_metadata_run;
431         else
432                 ms += area * ic->metadata_run;
433         ms >>= ic->log2_buffer_sectors;
434
435         sector_to_block(ic, offset);
436
437         if (likely(ic->log2_tag_size >= 0)) {
438                 ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
439                 mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
440         } else {
441                 ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
442                 mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
443         }
444         *metadata_offset = mo;
445         return ms;
446 }
447
448 static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
449 {
450         sector_t result;
451
452         if (ic->meta_dev)
453                 return offset;
454
455         result = area << ic->sb->log2_interleave_sectors;
456         if (likely(ic->log2_metadata_run >= 0))
457                 result += (area + 1) << ic->log2_metadata_run;
458         else
459                 result += (area + 1) * ic->metadata_run;
460
461         result += (sector_t)ic->initial_sectors + offset;
462         result += ic->start;
463
464         return result;
465 }
466
467 static void wraparound_section(struct dm_integrity_c *ic, unsigned int *sec_ptr)
468 {
469         if (unlikely(*sec_ptr >= ic->journal_sections))
470                 *sec_ptr -= ic->journal_sections;
471 }
472
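/* Pick the lowest superblock version that can express the enabled features. */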
473 static void sb_set_version(struct dm_integrity_c *ic)
474 {
475         if (ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE))
476                 ic->sb->version = SB_VERSION_6;
477         else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
478                 ic->sb->version = SB_VERSION_5;
479         else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
480                 ic->sb->version = SB_VERSION_4;
481         else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
482                 ic->sb->version = SB_VERSION_3;
483         else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
484                 ic->sb->version = SB_VERSION_2;
485         else
486                 ic->sb->version = SB_VERSION_1;
487 }
488
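/*
 * With the fixed-HMAC format, the last bytes of the superblock sector hold a
 * MAC of the rest of the sector.  wr == true computes and stores it;
 * wr == false recomputes it and compares it with what is on disk.
 */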
489 static int sb_mac(struct dm_integrity_c *ic, bool wr)
490 {
491         SHASH_DESC_ON_STACK(desc, ic->journal_mac);
492         int r;
493         unsigned int mac_size = crypto_shash_digestsize(ic->journal_mac);
494         __u8 *sb = (__u8 *)ic->sb;
495         __u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size;
496
497         if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT ||
498             mac_size > HASH_MAX_DIGESTSIZE) {
499                 dm_integrity_io_error(ic, "digest is too long", -EINVAL);
500                 return -EINVAL;
501         }
502
503         desc->tfm = ic->journal_mac;
504
505         if (likely(wr)) {
506                 r = crypto_shash_digest(desc, sb, mac - sb, mac);
507                 if (unlikely(r < 0)) {
508                         dm_integrity_io_error(ic, "crypto_shash_digest", r);
509                         return r;
510                 }
511         } else {
512                 __u8 actual_mac[HASH_MAX_DIGESTSIZE];
513
514                 r = crypto_shash_digest(desc, sb, mac - sb, actual_mac);
515                 if (unlikely(r < 0)) {
516                         dm_integrity_io_error(ic, "crypto_shash_digest", r);
517                         return r;
518                 }
519                 if (memcmp(mac, actual_mac, mac_size)) {
520                         dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
521                         dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
522                         return -EILSEQ;
523                 }
524         }
525
526         return 0;
527 }
528
529 static int sync_rw_sb(struct dm_integrity_c *ic, blk_opf_t opf)
530 {
531         struct dm_io_request io_req;
532         struct dm_io_region io_loc;
533         const enum req_op op = opf & REQ_OP_MASK;
534         int r;
535
536         io_req.bi_opf = opf;
537         io_req.mem.type = DM_IO_KMEM;
538         io_req.mem.ptr.addr = ic->sb;
539         io_req.notify.fn = NULL;
540         io_req.client = ic->io;
541         io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
542         io_loc.sector = ic->start;
543         io_loc.count = SB_SECTORS;
544
545         if (op == REQ_OP_WRITE) {
546                 sb_set_version(ic);
547                 if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
548                         r = sb_mac(ic, true);
549                         if (unlikely(r))
550                                 return r;
551                 }
552         }
553
554         r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
555         if (unlikely(r))
556                 return r;
557
558         if (op == REQ_OP_READ) {
559                 if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
560                         r = sb_mac(ic, false);
561                         if (unlikely(r))
562                                 return r;
563                 }
564         }
565
566         return 0;
567 }
568
569 #define BITMAP_OP_TEST_ALL_SET          0
570 #define BITMAP_OP_TEST_ALL_CLEAR        1
571 #define BITMAP_OP_SET                   2
572 #define BITMAP_OP_CLEAR                 3
573
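/*
 * Test, set or clear the bitmap bits covering [sector, sector + n_sectors).
 * One bit covers 2^(log2_sectors_per_block + log2_blocks_per_bitmap_bit)
 * sectors; the loop walks the bitmap page by page, so a range may span
 * several pages.
 */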
574 static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
575                             sector_t sector, sector_t n_sectors, int mode)
576 {
577         unsigned long bit, end_bit, this_end_bit, page, end_page;
578         unsigned long *data;
579
580         if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
581                 DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
582                         sector,
583                         n_sectors,
584                         ic->sb->log2_sectors_per_block,
585                         ic->log2_blocks_per_bitmap_bit,
586                         mode);
587                 BUG();
588         }
589
590         if (unlikely(!n_sectors))
591                 return true;
592
593         bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
594         end_bit = (sector + n_sectors - 1) >>
595                 (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
596
597         page = bit / (PAGE_SIZE * 8);
598         bit %= PAGE_SIZE * 8;
599
600         end_page = end_bit / (PAGE_SIZE * 8);
601         end_bit %= PAGE_SIZE * 8;
602
603 repeat:
604         if (page < end_page)
605                 this_end_bit = PAGE_SIZE * 8 - 1;
606         else
607                 this_end_bit = end_bit;
608
609         data = lowmem_page_address(bitmap[page].page);
610
611         if (mode == BITMAP_OP_TEST_ALL_SET) {
612                 while (bit <= this_end_bit) {
613                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
614                                 do {
615                                         if (data[bit / BITS_PER_LONG] != -1)
616                                                 return false;
617                                         bit += BITS_PER_LONG;
618                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
619                                 continue;
620                         }
621                         if (!test_bit(bit, data))
622                                 return false;
623                         bit++;
624                 }
625         } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
626                 while (bit <= this_end_bit) {
627                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
628                                 do {
629                                         if (data[bit / BITS_PER_LONG] != 0)
630                                                 return false;
631                                         bit += BITS_PER_LONG;
632                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
633                                 continue;
634                         }
635                         if (test_bit(bit, data))
636                                 return false;
637                         bit++;
638                 }
639         } else if (mode == BITMAP_OP_SET) {
640                 while (bit <= this_end_bit) {
641                         if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
642                                 do {
643                                         data[bit / BITS_PER_LONG] = -1;
644                                         bit += BITS_PER_LONG;
645                                 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
646                                 continue;
647                         }
648                         __set_bit(bit, data);
649                         bit++;
650                 }
651         } else if (mode == BITMAP_OP_CLEAR) {
652                 if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
653                         clear_page(data);
654                 else {
655                         while (bit <= this_end_bit) {
656                                 if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
657                                         do {
658                                                 data[bit / BITS_PER_LONG] = 0;
659                                                 bit += BITS_PER_LONG;
660                                         } while (this_end_bit >= bit + BITS_PER_LONG - 1);
661                                         continue;
662                                 }
663                                 __clear_bit(bit, data);
664                                 bit++;
665                         }
666                 }
667         } else {
668                 BUG();
669         }
670
671         if (unlikely(page < end_page)) {
672                 bit = 0;
673                 page++;
674                 goto repeat;
675         }
676
677         return true;
678 }
679
680 static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
681 {
682         unsigned int n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
683         unsigned int i;
684
685         for (i = 0; i < n_bitmap_pages; i++) {
686                 unsigned long *dst_data = lowmem_page_address(dst[i].page);
687                 unsigned long *src_data = lowmem_page_address(src[i].page);
688
689                 copy_page(dst_data, src_data);
690         }
691 }
692
693 static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
694 {
695         unsigned int bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
696         unsigned int bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
697
698         BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
699         return &ic->bbs[bitmap_block];
700 }
701
702 static void access_journal_check(struct dm_integrity_c *ic, unsigned int section, unsigned int offset,
703                                  bool e, const char *function)
704 {
705 #if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
706         unsigned int limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
707
708         if (unlikely(section >= ic->journal_sections) ||
709             unlikely(offset >= limit)) {
710                 DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
711                        function, section, offset, ic->journal_sections, limit);
712                 BUG();
713         }
714 #endif
715 }
716
717 static void page_list_location(struct dm_integrity_c *ic, unsigned int section, unsigned int offset,
718                                unsigned int *pl_index, unsigned int *pl_offset)
719 {
720         unsigned int sector;
721
722         access_journal_check(ic, section, offset, false, "page_list_location");
723
724         sector = section * ic->journal_section_sectors + offset;
725
726         *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
727         *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
728 }
729
730 static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
731                                                unsigned int section, unsigned int offset, unsigned int *n_sectors)
732 {
733         unsigned int pl_index, pl_offset;
734         char *va;
735
736         page_list_location(ic, section, offset, &pl_index, &pl_offset);
737
738         if (n_sectors)
739                 *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
740
741         va = lowmem_page_address(pl[pl_index].page);
742
743         return (struct journal_sector *)(va + pl_offset);
744 }
745
746 static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned int section, unsigned int offset)
747 {
748         return access_page_list(ic, ic->journal, section, offset, NULL);
749 }
750
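/*
 * Journal entries are striped across the first JOURNAL_BLOCK_SECTORS sectors
 * of a section: entry n lives in sector n % JOURNAL_BLOCK_SECTORS at slot
 * n / JOURNAL_BLOCK_SECTORS.  The data blocks the entries describe follow in
 * the remaining sectors of the section (see access_journal_data()).
 */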
751 static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned int section, unsigned int n)
752 {
753         unsigned int rel_sector, offset;
754         struct journal_sector *js;
755
756         access_journal_check(ic, section, n, true, "access_journal_entry");
757
758         rel_sector = n % JOURNAL_BLOCK_SECTORS;
759         offset = n / JOURNAL_BLOCK_SECTORS;
760
761         js = access_journal(ic, section, rel_sector);
762         return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
763 }
764
765 static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned int section, unsigned int n)
766 {
767         n <<= ic->sb->log2_sectors_per_block;
768
769         n += JOURNAL_BLOCK_SECTORS;
770
771         access_journal_check(ic, section, n, false, "access_journal_data");
772
773         return access_journal(ic, section, n);
774 }
775
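/*
 * Compute the MAC that authenticates one journal section.  It covers the
 * sector numbers of all entries in the section (plus the superblock salt and
 * the section number when the fixed-HMAC format is used) and is truncated or
 * zero-padded to JOURNAL_MAC_SIZE bytes.
 */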
776 static void section_mac(struct dm_integrity_c *ic, unsigned int section, __u8 result[JOURNAL_MAC_SIZE])
777 {
778         SHASH_DESC_ON_STACK(desc, ic->journal_mac);
779         int r;
780         unsigned int j, size;
781
782         desc->tfm = ic->journal_mac;
783
784         r = crypto_shash_init(desc);
785         if (unlikely(r < 0)) {
786                 dm_integrity_io_error(ic, "crypto_shash_init", r);
787                 goto err;
788         }
789
790         if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
791                 __le64 section_le;
792
793                 r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE);
794                 if (unlikely(r < 0)) {
795                         dm_integrity_io_error(ic, "crypto_shash_update", r);
796                         goto err;
797                 }
798
799                 section_le = cpu_to_le64(section);
800                 r = crypto_shash_update(desc, (__u8 *)&section_le, sizeof(section_le));
801                 if (unlikely(r < 0)) {
802                         dm_integrity_io_error(ic, "crypto_shash_update", r);
803                         goto err;
804                 }
805         }
806
807         for (j = 0; j < ic->journal_section_entries; j++) {
808                 struct journal_entry *je = access_journal_entry(ic, section, j);
809
810                 r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof(je->u.sector));
811                 if (unlikely(r < 0)) {
812                         dm_integrity_io_error(ic, "crypto_shash_update", r);
813                         goto err;
814                 }
815         }
816
817         size = crypto_shash_digestsize(ic->journal_mac);
818
819         if (likely(size <= JOURNAL_MAC_SIZE)) {
820                 r = crypto_shash_final(desc, result);
821                 if (unlikely(r < 0)) {
822                         dm_integrity_io_error(ic, "crypto_shash_final", r);
823                         goto err;
824                 }
825                 memset(result + size, 0, JOURNAL_MAC_SIZE - size);
826         } else {
827                 __u8 digest[HASH_MAX_DIGESTSIZE];
828
829                 if (WARN_ON(size > sizeof(digest))) {
830                         dm_integrity_io_error(ic, "digest_size", -EINVAL);
831                         goto err;
832                 }
833                 r = crypto_shash_final(desc, digest);
834                 if (unlikely(r < 0)) {
835                         dm_integrity_io_error(ic, "crypto_shash_final", r);
836                         goto err;
837                 }
838                 memcpy(result, digest, JOURNAL_MAC_SIZE);
839         }
840
841         return;
842 err:
843         memset(result, 0, JOURNAL_MAC_SIZE);
844 }
845
846 static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool wr)
847 {
848         __u8 result[JOURNAL_MAC_SIZE];
849         unsigned int j;
850
851         if (!ic->journal_mac)
852                 return;
853
854         section_mac(ic, section, result);
855
856         for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
857                 struct journal_sector *js = access_journal(ic, section, j);
858
859                 if (likely(wr))
860                         memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
861                 else {
862                         if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
863                                 dm_integrity_io_error(ic, "journal mac", -EILSEQ);
864                                 dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
865                         }
866                 }
867         }
868 }
869
870 static void complete_journal_op(void *context)
871 {
872         struct journal_completion *comp = context;
873
874         BUG_ON(!atomic_read(&comp->in_flight));
875         if (likely(atomic_dec_and_test(&comp->in_flight)))
876                 complete(&comp->comp);
877 }
878
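/*
 * Journal "encryption" for stream-cipher-like setups: ic->journal_xor holds
 * a keystream prepared at construction time, and the journal pages are simply
 * XORed with it through the async_tx API.  rw_section_mac() is folded in as
 * each section boundary is crossed.
 */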
879 static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section,
880                         unsigned int n_sections, struct journal_completion *comp)
881 {
882         struct async_submit_ctl submit;
883         size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
884         unsigned int pl_index, pl_offset, section_index;
885         struct page_list *source_pl, *target_pl;
886
887         if (likely(encrypt)) {
888                 source_pl = ic->journal;
889                 target_pl = ic->journal_io;
890         } else {
891                 source_pl = ic->journal_io;
892                 target_pl = ic->journal;
893         }
894
895         page_list_location(ic, section, 0, &pl_index, &pl_offset);
896
897         atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
898
899         init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
900
901         section_index = pl_index;
902
903         do {
904                 size_t this_step;
905                 struct page *src_pages[2];
906                 struct page *dst_page;
907
908                 while (unlikely(pl_index == section_index)) {
909                         unsigned int dummy;
910
911                         if (likely(encrypt))
912                                 rw_section_mac(ic, section, true);
913                         section++;
914                         n_sections--;
915                         if (!n_sections)
916                                 break;
917                         page_list_location(ic, section, 0, &section_index, &dummy);
918                 }
919
920                 this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
921                 dst_page = target_pl[pl_index].page;
922                 src_pages[0] = source_pl[pl_index].page;
923                 src_pages[1] = ic->journal_xor[pl_index].page;
924
925                 async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
926
927                 pl_index++;
928                 pl_offset = 0;
929                 n_bytes -= this_step;
930         } while (n_bytes);
931
932         BUG_ON(n_sections);
933
934         async_tx_issue_pending_all();
935 }
936
937 static void complete_journal_encrypt(void *data, int err)
938 {
939         struct journal_completion *comp = data;
940
941         if (unlikely(err)) {
942                 if (likely(err == -EINPROGRESS)) {
943                         complete(&comp->ic->crypto_backoff);
944                         return;
945                 }
946                 dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
947         }
948         complete_journal_op(comp);
949 }
950
951 static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
952 {
953         int r;
954
955         skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
956                                       complete_journal_encrypt, comp);
957         if (likely(encrypt))
958                 r = crypto_skcipher_encrypt(req);
959         else
960                 r = crypto_skcipher_decrypt(req);
961         if (likely(!r))
962                 return false;
963         if (likely(r == -EINPROGRESS))
964                 return true;
965         if (likely(r == -EBUSY)) {
966                 wait_for_completion(&comp->ic->crypto_backoff);
967                 reinit_completion(&comp->ic->crypto_backoff);
968                 return true;
969         }
970         dm_integrity_io_error(comp->ic, "encrypt", r);
971         return false;
972 }
973
974 static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section,
975                           unsigned int n_sections, struct journal_completion *comp)
976 {
977         struct scatterlist **source_sg;
978         struct scatterlist **target_sg;
979
980         atomic_add(2, &comp->in_flight);
981
982         if (likely(encrypt)) {
983                 source_sg = ic->journal_scatterlist;
984                 target_sg = ic->journal_io_scatterlist;
985         } else {
986                 source_sg = ic->journal_io_scatterlist;
987                 target_sg = ic->journal_scatterlist;
988         }
989
990         do {
991                 struct skcipher_request *req;
992                 unsigned int ivsize;
993                 char *iv;
994
995                 if (likely(encrypt))
996                         rw_section_mac(ic, section, true);
997
998                 req = ic->sk_requests[section];
999                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
1000                 iv = req->iv;
1001
1002                 memcpy(iv, iv + ivsize, ivsize);
1003
1004                 req->src = source_sg[section];
1005                 req->dst = target_sg[section];
1006
1007                 if (unlikely(do_crypt(encrypt, req, comp)))
1008                         atomic_inc(&comp->in_flight);
1009
1010                 section++;
1011                 n_sections--;
1012         } while (n_sections);
1013
1014         atomic_dec(&comp->in_flight);
1015         complete_journal_op(comp);
1016 }
1017
1018 static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned int section,
1019                             unsigned int n_sections, struct journal_completion *comp)
1020 {
1021         if (ic->journal_xor)
1022                 return xor_journal(ic, encrypt, section, n_sections, comp);
1023         else
1024                 return crypt_journal(ic, encrypt, section, n_sections, comp);
1025 }
1026
1027 static void complete_journal_io(unsigned long error, void *context)
1028 {
1029         struct journal_completion *comp = context;
1030
1031         if (unlikely(error != 0))
1032                 dm_integrity_io_error(comp->ic, "writing journal", -EIO);
1033         complete_journal_op(comp);
1034 }
1035
1036 static void rw_journal_sectors(struct dm_integrity_c *ic, blk_opf_t opf,
1037                                unsigned int sector, unsigned int n_sectors,
1038                                struct journal_completion *comp)
1039 {
1040         struct dm_io_request io_req;
1041         struct dm_io_region io_loc;
1042         unsigned int pl_index, pl_offset;
1043         int r;
1044
1045         if (unlikely(dm_integrity_failed(ic))) {
1046                 if (comp)
1047                         complete_journal_io(-1UL, comp);
1048                 return;
1049         }
1050
1051         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
1052         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
1053
1054         io_req.bi_opf = opf;
1055         io_req.mem.type = DM_IO_PAGE_LIST;
1056         if (ic->journal_io)
1057                 io_req.mem.ptr.pl = &ic->journal_io[pl_index];
1058         else
1059                 io_req.mem.ptr.pl = &ic->journal[pl_index];
1060         io_req.mem.offset = pl_offset;
1061         if (likely(comp != NULL)) {
1062                 io_req.notify.fn = complete_journal_io;
1063                 io_req.notify.context = comp;
1064         } else {
1065                 io_req.notify.fn = NULL;
1066         }
1067         io_req.client = ic->io;
1068         io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
1069         io_loc.sector = ic->start + SB_SECTORS + sector;
1070         io_loc.count = n_sectors;
1071
1072         r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
1073         if (unlikely(r)) {
1074                 dm_integrity_io_error(ic, (opf & REQ_OP_MASK) == REQ_OP_READ ?
1075                                       "reading journal" : "writing journal", r);
1076                 if (comp) {
1077                         WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
1078                         complete_journal_io(-1UL, comp);
1079                 }
1080         }
1081 }
1082
1083 static void rw_journal(struct dm_integrity_c *ic, blk_opf_t opf,
1084                        unsigned int section, unsigned int n_sections,
1085                        struct journal_completion *comp)
1086 {
1087         unsigned int sector, n_sectors;
1088
1089         sector = section * ic->journal_section_sectors;
1090         n_sectors = n_sections * ic->journal_section_sectors;
1091
1092         rw_journal_sectors(ic, opf, sector, n_sectors, comp);
1093 }
1094
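/*
 * Encrypt/MAC (if configured) and write commit_sections journal sections
 * starting at commit_start, using FUA writes.  A range that wraps past the
 * end of the journal is split into two I/Os, with the encryption of the two
 * halves overlapped where possible.
 */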
1095 static void write_journal(struct dm_integrity_c *ic, unsigned int commit_start, unsigned int commit_sections)
1096 {
1097         struct journal_completion io_comp;
1098         struct journal_completion crypt_comp_1;
1099         struct journal_completion crypt_comp_2;
1100         unsigned int i;
1101
1102         io_comp.ic = ic;
1103         init_completion(&io_comp.comp);
1104
1105         if (commit_start + commit_sections <= ic->journal_sections) {
1106                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
1107                 if (ic->journal_io) {
1108                         crypt_comp_1.ic = ic;
1109                         init_completion(&crypt_comp_1.comp);
1110                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1111                         encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
1112                         wait_for_completion_io(&crypt_comp_1.comp);
1113                 } else {
1114                         for (i = 0; i < commit_sections; i++)
1115                                 rw_section_mac(ic, commit_start + i, true);
1116                 }
1117                 rw_journal(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, commit_start,
1118                            commit_sections, &io_comp);
1119         } else {
1120                 unsigned int to_end;
1121
1122                 io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
1123                 to_end = ic->journal_sections - commit_start;
1124                 if (ic->journal_io) {
1125                         crypt_comp_1.ic = ic;
1126                         init_completion(&crypt_comp_1.comp);
1127                         crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1128                         encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
1129                         if (try_wait_for_completion(&crypt_comp_1.comp)) {
1130                                 rw_journal(ic, REQ_OP_WRITE | REQ_FUA,
1131                                            commit_start, to_end, &io_comp);
1132                                 reinit_completion(&crypt_comp_1.comp);
1133                                 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
1134                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
1135                                 wait_for_completion_io(&crypt_comp_1.comp);
1136                         } else {
1137                                 crypt_comp_2.ic = ic;
1138                                 init_completion(&crypt_comp_2.comp);
1139                                 crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
1140                                 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
1141                                 wait_for_completion_io(&crypt_comp_1.comp);
1142                                 rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp);
1143                                 wait_for_completion_io(&crypt_comp_2.comp);
1144                         }
1145                 } else {
1146                         for (i = 0; i < to_end; i++)
1147                                 rw_section_mac(ic, commit_start + i, true);
1148                         rw_journal(ic, REQ_OP_WRITE | REQ_FUA, commit_start, to_end, &io_comp);
1149                         for (i = 0; i < commit_sections - to_end; i++)
1150                                 rw_section_mac(ic, i, true);
1151                 }
1152                 rw_journal(ic, REQ_OP_WRITE | REQ_FUA, 0, commit_sections - to_end, &io_comp);
1153         }
1154
1155         wait_for_completion_io(&io_comp.comp);
1156 }
1157
1158 static void copy_from_journal(struct dm_integrity_c *ic, unsigned int section, unsigned int offset,
1159                               unsigned int n_sectors, sector_t target, io_notify_fn fn, void *data)
1160 {
1161         struct dm_io_request io_req;
1162         struct dm_io_region io_loc;
1163         int r;
1164         unsigned int sector, pl_index, pl_offset;
1165
1166         BUG_ON((target | n_sectors | offset) & (unsigned int)(ic->sectors_per_block - 1));
1167
1168         if (unlikely(dm_integrity_failed(ic))) {
1169                 fn(-1UL, data);
1170                 return;
1171         }
1172
1173         sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
1174
1175         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
1176         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
1177
1178         io_req.bi_opf = REQ_OP_WRITE;
1179         io_req.mem.type = DM_IO_PAGE_LIST;
1180         io_req.mem.ptr.pl = &ic->journal[pl_index];
1181         io_req.mem.offset = pl_offset;
1182         io_req.notify.fn = fn;
1183         io_req.notify.context = data;
1184         io_req.client = ic->io;
1185         io_loc.bdev = ic->dev->bdev;
1186         io_loc.sector = target;
1187         io_loc.count = n_sectors;
1188
1189         r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
1190         if (unlikely(r)) {
1191                 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
1192                 fn(-1UL, data);
1193         }
1194 }
1195
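/*
 * ic->in_progress is an rb-tree of sector ranges that currently have I/O in
 * flight.  A new range that overlaps an existing or waiting one is not
 * inserted; the caller sleeps on ic->wait_list until remove_range_unlocked()
 * manages to re-insert it and wakes the task.
 */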
1196 static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
1197 {
1198         return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
1199                range1->logical_sector + range1->n_sectors > range2->logical_sector;
1200 }
1201
1202 static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
1203 {
1204         struct rb_node **n = &ic->in_progress.rb_node;
1205         struct rb_node *parent;
1206
1207         BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned int)(ic->sectors_per_block - 1));
1208
1209         if (likely(check_waiting)) {
1210                 struct dm_integrity_range *range;
1211
1212                 list_for_each_entry(range, &ic->wait_list, wait_entry) {
1213                         if (unlikely(ranges_overlap(range, new_range)))
1214                                 return false;
1215                 }
1216         }
1217
1218         parent = NULL;
1219
1220         while (*n) {
1221                 struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
1222
1223                 parent = *n;
1224                 if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector)
1225                         n = &range->node.rb_left;
1226                 else if (new_range->logical_sector >= range->logical_sector + range->n_sectors)
1227                         n = &range->node.rb_right;
1228                 else
1229                         return false;
1230         }
1231
1232         rb_link_node(&new_range->node, parent, n);
1233         rb_insert_color(&new_range->node, &ic->in_progress);
1234
1235         return true;
1236 }
1237
1238 static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
1239 {
1240         rb_erase(&range->node, &ic->in_progress);
1241         while (unlikely(!list_empty(&ic->wait_list))) {
1242                 struct dm_integrity_range *last_range =
1243                         list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
1244                 struct task_struct *last_range_task;
1245
1246                 last_range_task = last_range->task;
1247                 list_del(&last_range->wait_entry);
1248                 if (!add_new_range(ic, last_range, false)) {
1249                         last_range->task = last_range_task;
1250                         list_add(&last_range->wait_entry, &ic->wait_list);
1251                         break;
1252                 }
1253                 last_range->waiting = false;
1254                 wake_up_process(last_range_task);
1255         }
1256 }
1257
1258 static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
1259 {
1260         unsigned long flags;
1261
1262         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1263         remove_range_unlocked(ic, range);
1264         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1265 }
1266
1267 static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
1268 {
1269         new_range->waiting = true;
1270         list_add_tail(&new_range->wait_entry, &ic->wait_list);
1271         new_range->task = current;
1272         do {
1273                 __set_current_state(TASK_UNINTERRUPTIBLE);
1274                 spin_unlock_irq(&ic->endio_wait.lock);
1275                 io_schedule();
1276                 spin_lock_irq(&ic->endio_wait.lock);
1277         } while (unlikely(new_range->waiting));
1278 }
1279
1280 static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
1281 {
1282         if (unlikely(!add_new_range(ic, new_range, true)))
1283                 wait_and_add_new_range(ic, new_range);
1284 }
1285
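/*
 * ic->journal_tree mirrors the in-memory journal: one journal_node per journal
 * entry, kept in an rb-tree keyed by data sector so reads can find blocks
 * whose newest copy still sits in the journal.
 */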
1286 static void init_journal_node(struct journal_node *node)
1287 {
1288         RB_CLEAR_NODE(&node->node);
1289         node->sector = (sector_t)-1;
1290 }
1291
1292 static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
1293 {
1294         struct rb_node **link;
1295         struct rb_node *parent;
1296
1297         node->sector = sector;
1298         BUG_ON(!RB_EMPTY_NODE(&node->node));
1299
1300         link = &ic->journal_tree_root.rb_node;
1301         parent = NULL;
1302
1303         while (*link) {
1304                 struct journal_node *j;
1305
1306                 parent = *link;
1307                 j = container_of(parent, struct journal_node, node);
1308                 if (sector < j->sector)
1309                         link = &j->node.rb_left;
1310                 else
1311                         link = &j->node.rb_right;
1312         }
1313
1314         rb_link_node(&node->node, parent, link);
1315         rb_insert_color(&node->node, &ic->journal_tree_root);
1316 }
1317
1318 static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
1319 {
1320         BUG_ON(RB_EMPTY_NODE(&node->node));
1321         rb_erase(&node->node, &ic->journal_tree_root);
1322         init_journal_node(node);
1323 }
1324
1325 #define NOT_FOUND       (-1U)
1326
1327 static unsigned int find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
1328 {
1329         struct rb_node *n = ic->journal_tree_root.rb_node;
1330         unsigned int found = NOT_FOUND;
1331
1332         *next_sector = (sector_t)-1;
1333         while (n) {
1334                 struct journal_node *j = container_of(n, struct journal_node, node);
1335
1336                 if (sector == j->sector)
1337                         found = j - ic->journal_tree;
1338
1339                 if (sector < j->sector) {
1340                         *next_sector = j->sector;
1341                         n = j->node.rb_left;
1342                 } else
1343                         n = j->node.rb_right;
1344         }
1345
1346         return found;
1347 }
1348
1349 static bool test_journal_node(struct dm_integrity_c *ic, unsigned int pos, sector_t sector)
1350 {
1351         struct journal_node *node, *next_node;
1352         struct rb_node *next;
1353
1354         if (unlikely(pos >= ic->journal_entries))
1355                 return false;
1356         node = &ic->journal_tree[pos];
1357         if (unlikely(RB_EMPTY_NODE(&node->node)))
1358                 return false;
1359         if (unlikely(node->sector != sector))
1360                 return false;
1361
1362         next = rb_next(&node->node);
1363         if (unlikely(!next))
1364                 return true;
1365
1366         next_node = container_of(next, struct journal_node, node);
1367         return next_node->sector != sector;
1368 }
1369
1370 static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
1371 {
1372         struct rb_node *next;
1373         struct journal_node *next_node;
1374         unsigned int next_section;
1375
1376         BUG_ON(RB_EMPTY_NODE(&node->node));
1377
1378         next = rb_next(&node->node);
1379         if (unlikely(!next))
1380                 return false;
1381
1382         next_node = container_of(next, struct journal_node, node);
1383
1384         if (next_node->sector != node->sector)
1385                 return false;
1386
1387         next_section = (unsigned int)(next_node - ic->journal_tree) / ic->journal_section_entries;
1388         if (next_section >= ic->committed_section &&
1389             next_section < ic->committed_section + ic->n_committed_sections)
1390                 return true;
1391         if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
1392                 return true;
1393
1394         return false;
1395 }
1396
1397 #define TAG_READ        0
1398 #define TAG_WRITE       1
1399 #define TAG_CMP         2
1400
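/*
 * Read, write or compare total_size bytes of tag data at the position given
 * by *metadata_block / *metadata_offset, advancing both as it goes.
 * TAG_WRITE dirties a buffer only when the tag actually changed; TAG_CMP
 * returns 0 on a match (with discards enabled, a tag area full of
 * DISCARD_FILLER also counts as matching) and a nonzero byte count when a
 * real mismatch is found.
 */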
1401 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
1402                                unsigned int *metadata_offset, unsigned int total_size, int op)
1403 {
1404 #define MAY_BE_FILLER           1
1405 #define MAY_BE_HASH             2
1406         unsigned int hash_offset = 0;
1407         unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
1408
1409         do {
1410                 unsigned char *data, *dp;
1411                 struct dm_buffer *b;
1412                 unsigned int to_copy;
1413                 int r;
1414
1415                 r = dm_integrity_failed(ic);
1416                 if (unlikely(r))
1417                         return r;
1418
1419                 data = dm_bufio_read(ic->bufio, *metadata_block, &b);
1420                 if (IS_ERR(data))
1421                         return PTR_ERR(data);
1422
1423                 to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
1424                 dp = data + *metadata_offset;
1425                 if (op == TAG_READ) {
1426                         memcpy(tag, dp, to_copy);
1427                 } else if (op == TAG_WRITE) {
1428                         if (memcmp(dp, tag, to_copy)) {
1429                                 memcpy(dp, tag, to_copy);
1430                                 dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
1431                         }
1432                 } else {
1433                         /* e.g.: op == TAG_CMP */
1434
1435                         if (likely(is_power_of_2(ic->tag_size))) {
1436                                 if (unlikely(memcmp(dp, tag, to_copy))) {
1437                                         if (unlikely(!ic->discard) ||
1438                                             unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL))
1439                                                 goto thorough_test;
1440                                 }
1441                         } else {
1442                                 unsigned int i, ts;
1443 thorough_test:
1444                                 ts = total_size;
1445
1446                                 for (i = 0; i < to_copy; i++, ts--) {
1447                                         if (unlikely(dp[i] != tag[i]))
1448                                                 may_be &= ~MAY_BE_HASH;
1449                                         if (likely(dp[i] != DISCARD_FILLER))
1450                                                 may_be &= ~MAY_BE_FILLER;
1451                                         hash_offset++;
1452                                         if (unlikely(hash_offset == ic->tag_size)) {
1453                                                 if (unlikely(!may_be)) {
1454                                                         dm_bufio_release(b);
1455                                                         return ts;
1456                                                 }
1457                                                 hash_offset = 0;
1458                                                 may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
1459                                         }
1460                                 }
1461                         }
1462                 }
1463                 dm_bufio_release(b);
1464
1465                 tag += to_copy;
1466                 *metadata_offset += to_copy;
1467                 if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
1468                         (*metadata_block)++;
1469                         *metadata_offset = 0;
1470                 }
1471
1472                 if (unlikely(!is_power_of_2(ic->tag_size)))
1473                         hash_offset = (hash_offset + to_copy) % ic->tag_size;
1474
1475                 total_size -= to_copy;
1476         } while (unlikely(total_size));
1477
1478         return 0;
1479 #undef MAY_BE_FILLER
1480 #undef MAY_BE_HASH
1481 }
1482
1483 struct flush_request {
1484         struct dm_io_request io_req;
1485         struct dm_io_region io_reg;
1486         struct dm_integrity_c *ic;
1487         struct completion comp;
1488 };
1489
1490 static void flush_notify(unsigned long error, void *fr_)
1491 {
1492         struct flush_request *fr = fr_;
1493
1494         if (unlikely(error != 0))
1495                 dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO);
1496         complete(&fr->comp);
1497 }
1498
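     /*
      * Write out the dirty metadata buffers; when a separate metadata device
      * is used and flush_data is set, an empty REQ_PREFLUSH is also submitted
      * to the data device in parallel and waited for at the end.
      */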
1499 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data)
1500 {
1501         int r;
1502         struct flush_request fr;
1503
1504         if (!ic->meta_dev)
1505                 flush_data = false;
1506         if (flush_data) {
1507                 fr.io_req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1508                 fr.io_req.mem.type = DM_IO_KMEM;
1509                 fr.io_req.mem.ptr.addr = NULL;
1510                 fr.io_req.notify.fn = flush_notify;
1511                 fr.io_req.notify.context = &fr;
1512                 fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio);
1513                 fr.io_reg.bdev = ic->dev->bdev;
1514                 fr.io_reg.sector = 0;
1515                 fr.io_reg.count = 0;
1516                 fr.ic = ic;
1517                 init_completion(&fr.comp);
1518                 r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL, IOPRIO_DEFAULT);
1519                 BUG_ON(r);
1520         }
1521
1522         r = dm_bufio_write_dirty_buffers(ic->bufio);
1523         if (unlikely(r))
1524                 dm_integrity_io_error(ic, "writing tags", r);
1525
1526         if (flush_data)
1527                 wait_for_completion(&fr.comp);
1528 }
1529
1530 static void sleep_on_endio_wait(struct dm_integrity_c *ic)
1531 {
1532         DECLARE_WAITQUEUE(wait, current);
1533
1534         __add_wait_queue(&ic->endio_wait, &wait);
1535         __set_current_state(TASK_UNINTERRUPTIBLE);
1536         spin_unlock_irq(&ic->endio_wait.lock);
1537         io_schedule();
1538         spin_lock_irq(&ic->endio_wait.lock);
1539         __remove_wait_queue(&ic->endio_wait, &wait);
1540 }
1541
1542 static void autocommit_fn(struct timer_list *t)
1543 {
1544         struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
1545
1546         if (likely(!dm_integrity_failed(ic)))
1547                 queue_work(ic->commit_wq, &ic->commit_work);
1548 }
1549
1550 static void schedule_autocommit(struct dm_integrity_c *ic)
1551 {
1552         if (!timer_pending(&ic->autocommit_timer))
1553                 mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
1554 }
1555
1556 static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1557 {
1558         struct bio *bio;
1559         unsigned long flags;
1560
1561         spin_lock_irqsave(&ic->endio_wait.lock, flags);
1562         bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1563         bio_list_add(&ic->flush_bio_list, bio);
1564         spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1565
1566         queue_work(ic->commit_wq, &ic->commit_work);
1567 }
1568
1569 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1570 {
1571         int r;
1572
1573         r = dm_integrity_failed(ic);
1574         if (unlikely(r) && !bio->bi_status)
1575                 bio->bi_status = errno_to_blk_status(r);
1576         if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
1577                 unsigned long flags;
1578
1579                 spin_lock_irqsave(&ic->endio_wait.lock, flags);
1580                 bio_list_add(&ic->synchronous_bios, bio);
1581                 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
1582                 spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1583                 return;
1584         }
1585         bio_endio(bio);
1586 }
1587
1588 static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1589 {
1590         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1591
1592         if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
1593                 submit_flush_bio(ic, dio);
1594         else
1595                 do_endio(ic, bio);
1596 }
1597
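     /*
      * Drop one reference on the in-flight I/O; on the last reference, release
      * the locked range and either finish the bio or, if only part of the bio
      * was processed (the range was truncated), advance the bio and requeue
      * the remainder on the offload workqueue.
      */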
1598 static void dec_in_flight(struct dm_integrity_io *dio)
1599 {
1600         if (atomic_dec_and_test(&dio->in_flight)) {
1601                 struct dm_integrity_c *ic = dio->ic;
1602                 struct bio *bio;
1603
1604                 remove_range(ic, &dio->range);
1605
1606                 if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
1607                         schedule_autocommit(ic);
1608
1609                 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1610                 if (unlikely(dio->bi_status) && !bio->bi_status)
1611                         bio->bi_status = dio->bi_status;
1612                 if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
1613                         dio->range.logical_sector += dio->range.n_sectors;
1614                         bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
1615                         INIT_WORK(&dio->work, integrity_bio_wait);
1616                         queue_work(ic->offload_wq, &dio->work);
1617                         return;
1618                 }
1619                 do_endio_flush(ic, dio);
1620         }
1621 }
1622
1623 static void integrity_end_io(struct bio *bio)
1624 {
1625         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1626
1627         dm_bio_restore(&dio->bio_details, bio);
1628         if (bio->bi_integrity)
1629                 bio->bi_opf |= REQ_INTEGRITY;
1630
1631         if (dio->completion)
1632                 complete(dio->completion);
1633
1634         dec_in_flight(dio);
1635 }
1636
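     /*
      * Compute the internal-hash checksum of one block: hash the superblock
      * salt (only with SB_FLAG_FIXED_HMAC), then the little-endian sector
      * number, then the block data.  If the digest is shorter than the tag,
      * the rest of the tag is zero-filled; if a crypto call fails (which
      * should not happen), the result is filled with random bytes so that it
      * will not match.
      */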
1637 static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
1638                                       const char *data, char *result)
1639 {
1640         __le64 sector_le = cpu_to_le64(sector);
1641         SHASH_DESC_ON_STACK(req, ic->internal_hash);
1642         int r;
1643         unsigned int digest_size;
1644
1645         req->tfm = ic->internal_hash;
1646
1647         r = crypto_shash_init(req);
1648         if (unlikely(r < 0)) {
1649                 dm_integrity_io_error(ic, "crypto_shash_init", r);
1650                 goto failed;
1651         }
1652
1653         if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
1654                 r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE);
1655                 if (unlikely(r < 0)) {
1656                         dm_integrity_io_error(ic, "crypto_shash_update", r);
1657                         goto failed;
1658                 }
1659         }
1660
1661         r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof(sector_le));
1662         if (unlikely(r < 0)) {
1663                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1664                 goto failed;
1665         }
1666
1667         r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
1668         if (unlikely(r < 0)) {
1669                 dm_integrity_io_error(ic, "crypto_shash_update", r);
1670                 goto failed;
1671         }
1672
1673         r = crypto_shash_final(req, result);
1674         if (unlikely(r < 0)) {
1675                 dm_integrity_io_error(ic, "crypto_shash_final", r);
1676                 goto failed;
1677         }
1678
1679         digest_size = crypto_shash_digestsize(ic->internal_hash);
1680         if (unlikely(digest_size < ic->tag_size))
1681                 memset(result + digest_size, 0, ic->tag_size - digest_size);
1682
1683         return;
1684
1685 failed:
1686         /* this shouldn't happen anyway, the hash functions have no reason to fail */
1687         get_random_bytes(result, ic->tag_size);
1688 }
1689
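     /*
      * Slow path for reads whose tag comparison failed: re-read each block
      * synchronously into a private page from the recheck mempool and verify
      * the checksum again, so that a mismatch caused by the bio's pages
      * changing under us (rather than by on-disk corruption) is not reported
      * as an integrity error.
      */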
1690 static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum)
1691 {
1692         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1693         struct dm_integrity_c *ic = dio->ic;
1694         struct bvec_iter iter;
1695         struct bio_vec bv;
1696         sector_t sector, logical_sector, area, offset;
1697         struct page *page;
1698
1699         get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1700         dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset,
1701                                                              &dio->metadata_offset);
1702         sector = get_data_sector(ic, area, offset);
1703         logical_sector = dio->range.logical_sector;
1704
1705         page = mempool_alloc(&ic->recheck_pool, GFP_NOIO);
1706
1707         __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
1708                 unsigned pos = 0;
1709
1710                 do {
1711                         sector_t alignment;
1712                         char *mem;
1713                         char *buffer = page_to_virt(page);
1714                         int r;
1715                         struct dm_io_request io_req;
1716                         struct dm_io_region io_loc;
1717                         io_req.bi_opf = REQ_OP_READ;
1718                         io_req.mem.type = DM_IO_KMEM;
1719                         io_req.mem.ptr.addr = buffer;
1720                         io_req.notify.fn = NULL;
1721                         io_req.client = ic->io;
1722                         io_loc.bdev = ic->dev->bdev;
1723                         io_loc.sector = sector;
1724                         io_loc.count = ic->sectors_per_block;
1725
1726                         /* Align the bio to logical block size */
1727                         alignment = dio->range.logical_sector | bio_sectors(bio) | (PAGE_SIZE >> SECTOR_SHIFT);
1728                         alignment &= -alignment;
1729                         io_loc.sector = round_down(io_loc.sector, alignment);
1730                         io_loc.count += sector - io_loc.sector;
1731                         buffer += (sector - io_loc.sector) << SECTOR_SHIFT;
1732                         io_loc.count = round_up(io_loc.count, alignment);
1733
1734                         r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
1735                         if (unlikely(r)) {
1736                                 dio->bi_status = errno_to_blk_status(r);
1737                                 goto free_ret;
1738                         }
1739
1740                         integrity_sector_checksum(ic, logical_sector, buffer, checksum);
1741                         r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block,
1742                                                 &dio->metadata_offset, ic->tag_size, TAG_CMP);
1743                         if (r) {
1744                                 if (r > 0) {
1745                                         DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
1746                                                     bio->bi_bdev, logical_sector);
1747                                         atomic64_inc(&ic->number_of_mismatches);
1748                                         dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
1749                                                          bio, logical_sector, 0);
1750                                         r = -EILSEQ;
1751                                 }
1752                                 dio->bi_status = errno_to_blk_status(r);
1753                                 goto free_ret;
1754                         }
1755
1756                         mem = bvec_kmap_local(&bv);
1757                         memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT);
1758                         kunmap_local(mem);
1759
1760                         pos += ic->sectors_per_block << SECTOR_SHIFT;
1761                         sector += ic->sectors_per_block;
1762                         logical_sector += ic->sectors_per_block;
1763                 } while (pos < bv.bv_len);
1764         }
1765 free_ret:
1766         mempool_free(page, &ic->recheck_pool);
1767 }
1768
1769 static void integrity_metadata(struct work_struct *w)
1770 {
1771         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1772         struct dm_integrity_c *ic = dio->ic;
1773
1774         int r;
1775
1776         if (ic->internal_hash) {
1777                 struct bvec_iter iter;
1778                 struct bio_vec bv;
1779                 unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
1780                 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1781                 char *checksums;
1782                 unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
1783                 char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
1784                 sector_t sector;
1785                 unsigned int sectors_to_process;
1786
1787                 if (unlikely(ic->mode == 'R'))
1788                         goto skip_io;
1789
1790                 if (likely(dio->op != REQ_OP_DISCARD))
1791                         checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
1792                                             GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1793                 else
1794                         checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1795                 if (!checksums) {
1796                         checksums = checksums_onstack;
1797                         if (WARN_ON(extra_space &&
1798                                     digest_size > sizeof(checksums_onstack))) {
1799                                 r = -EINVAL;
1800                                 goto error;
1801                         }
1802                 }
1803
1804                 if (unlikely(dio->op == REQ_OP_DISCARD)) {
1805                         unsigned int bi_size = dio->bio_details.bi_iter.bi_size;
1806                         unsigned int max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
1807                         unsigned int max_blocks = max_size / ic->tag_size;
1808
1809                         memset(checksums, DISCARD_FILLER, max_size);
1810
1811                         while (bi_size) {
1812                                 unsigned int this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
1813
1814                                 this_step_blocks = min(this_step_blocks, max_blocks);
1815                                 r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1816                                                         this_step_blocks * ic->tag_size, TAG_WRITE);
1817                                 if (unlikely(r)) {
1818                                         if (likely(checksums != checksums_onstack))
1819                                                 kfree(checksums);
1820                                         goto error;
1821                                 }
1822
1823                                 bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
1824                         }
1825
1826                         if (likely(checksums != checksums_onstack))
1827                                 kfree(checksums);
1828                         goto skip_io;
1829                 }
1830
1831                 sector = dio->range.logical_sector;
1832                 sectors_to_process = dio->range.n_sectors;
1833
1834                 __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
1835                         struct bio_vec bv_copy = bv;
1836                         unsigned int pos;
1837                         char *mem, *checksums_ptr;
1838
1839 again:
1840                         mem = bvec_kmap_local(&bv_copy);
1841                         pos = 0;
1842                         checksums_ptr = checksums;
1843                         do {
1844                                 integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
1845                                 checksums_ptr += ic->tag_size;
1846                                 sectors_to_process -= ic->sectors_per_block;
1847                                 pos += ic->sectors_per_block << SECTOR_SHIFT;
1848                                 sector += ic->sectors_per_block;
1849                         } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack);
1850                         kunmap_local(mem);
1851
1852                         r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1853                                                 checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
1854                         if (unlikely(r)) {
1855                                 if (likely(checksums != checksums_onstack))
1856                                         kfree(checksums);
1857                                 if (r > 0) {
1858                                         integrity_recheck(dio, checksums_onstack);
1859                                         goto skip_io;
1860                                 }
1861                                 goto error;
1862                         }
1863
1864                         if (!sectors_to_process)
1865                                 break;
1866
1867                         if (unlikely(pos < bv_copy.bv_len)) {
1868                                 bv_copy.bv_offset += pos;
1869                                 bv_copy.bv_len -= pos;
1870                                 goto again;
1871                         }
1872                 }
1873
1874                 if (likely(checksums != checksums_onstack))
1875                         kfree(checksums);
1876         } else {
1877                 struct bio_integrity_payload *bip = dio->bio_details.bi_integrity;
1878
1879                 if (bip) {
1880                         struct bio_vec biv;
1881                         struct bvec_iter iter;
1882                         unsigned int data_to_process = dio->range.n_sectors;
1883
1884                         sector_to_block(ic, data_to_process);
1885                         data_to_process *= ic->tag_size;
1886
1887                         bip_for_each_vec(biv, bip, iter) {
1888                                 unsigned char *tag;
1889                                 unsigned int this_len;
1890
1891                                 BUG_ON(PageHighMem(biv.bv_page));
1892                                 tag = bvec_virt(&biv);
1893                                 this_len = min(biv.bv_len, data_to_process);
1894                                 r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
1895                                                         this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
1896                                 if (unlikely(r))
1897                                         goto error;
1898                                 data_to_process -= this_len;
1899                                 if (!data_to_process)
1900                                         break;
1901                         }
1902                 }
1903         }
1904 skip_io:
1905         dec_in_flight(dio);
1906         return;
1907 error:
1908         dio->bi_status = errno_to_blk_status(r);
1909         dec_in_flight(dio);
1910 }
1911
1912 static inline bool dm_integrity_check_limits(struct dm_integrity_c *ic, sector_t logical_sector, struct bio *bio)
1913 {
1914         if (unlikely(logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
1915                 DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
1916                       logical_sector, bio_sectors(bio),
1917                       ic->provided_data_sectors);
1918                 return false;
1919         }
1920         if (unlikely((logical_sector | bio_sectors(bio)) & (unsigned int)(ic->sectors_per_block - 1))) {
1921                 DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
1922                       ic->sectors_per_block,
1923                       logical_sector, bio_sectors(bio));
1924                 return false;
1925         }
1926         if (ic->sectors_per_block > 1 && likely(bio_op(bio) != REQ_OP_DISCARD)) {
1927                 struct bvec_iter iter;
1928                 struct bio_vec bv;
1929
1930                 bio_for_each_segment(bv, bio, iter) {
1931                         if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
1932                                 DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
1933                                         bv.bv_offset, bv.bv_len, ic->sectors_per_block);
1934                                 return false;
1935                         }
1936                 }
1937         }
1938         return true;
1939 }
1940
1941 static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1942 {
1943         struct dm_integrity_c *ic = ti->private;
1944         struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1945         struct bio_integrity_payload *bip;
1946
1947         sector_t area, offset;
1948
1949         dio->ic = ic;
1950         dio->bi_status = 0;
1951         dio->op = bio_op(bio);
1952
1953         if (ic->mode == 'I') {
1954                 bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
1955                 dio->integrity_payload = NULL;
1956                 dio->integrity_payload_from_mempool = false;
1957                 dio->integrity_range_locked = false;
1958                 return dm_integrity_map_inline(dio, true);
1959         }
1960
1961         if (unlikely(dio->op == REQ_OP_DISCARD)) {
1962                 if (ti->max_io_len) {
1963                         sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
1964                         unsigned int log2_max_io_len = __fls(ti->max_io_len);
1965                         sector_t start_boundary = sec >> log2_max_io_len;
1966                         sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
1967
1968                         if (start_boundary < end_boundary) {
1969                                 sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
1970
1971                                 dm_accept_partial_bio(bio, len);
1972                         }
1973                 }
1974         }
1975
1976         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1977                 submit_flush_bio(ic, dio);
1978                 return DM_MAPIO_SUBMITTED;
1979         }
1980
1981         dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1982         dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
1983         if (unlikely(dio->fua)) {
1984                 /*
1985                  * Don't pass down the FUA flag because we have to flush the
1986                  * disk cache anyway.
1987                  */
1988                 bio->bi_opf &= ~REQ_FUA;
1989         }
1990         if (unlikely(!dm_integrity_check_limits(ic, dio->range.logical_sector, bio)))
1991                 return DM_MAPIO_KILL;
1992
1993         bip = bio_integrity(bio);
1994         if (!ic->internal_hash) {
1995                 if (bip) {
1996                         unsigned int wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
1997
1998                         if (ic->log2_tag_size >= 0)
1999                                 wanted_tag_size <<= ic->log2_tag_size;
2000                         else
2001                                 wanted_tag_size *= ic->tag_size;
2002                         if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
2003                                 DMERR("Invalid integrity data size %u, expected %u",
2004                                       bip->bip_iter.bi_size, wanted_tag_size);
2005                                 return DM_MAPIO_KILL;
2006                         }
2007                 }
2008         } else {
2009                 if (unlikely(bip != NULL)) {
2010                         DMERR("Unexpected integrity data when using internal hash");
2011                         return DM_MAPIO_KILL;
2012                 }
2013         }
2014
2015         if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
2016                 return DM_MAPIO_KILL;
2017
2018         get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
2019         dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
2020         bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
2021
2022         dm_integrity_map_continue(dio, true);
2023         return DM_MAPIO_SUBMITTED;
2024 }
2025
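     /*
      * Copy data between the bio and the in-memory journal starting at the
      * given section/entry: reads are served from the journal data, writes
      * fill in journal entries (data, tag and sector number).  Returns true
      * if the bio extends past the entries allocated for it, in which case
      * the caller must retry mapping the remainder.
      */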
2026 static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
2027                                  unsigned int journal_section, unsigned int journal_entry)
2028 {
2029         struct dm_integrity_c *ic = dio->ic;
2030         sector_t logical_sector;
2031         unsigned int n_sectors;
2032
2033         logical_sector = dio->range.logical_sector;
2034         n_sectors = dio->range.n_sectors;
2035         do {
2036                 struct bio_vec bv = bio_iovec(bio);
2037                 char *mem;
2038
2039                 if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
2040                         bv.bv_len = n_sectors << SECTOR_SHIFT;
2041                 n_sectors -= bv.bv_len >> SECTOR_SHIFT;
2042                 bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
2043 retry_kmap:
2044                 mem = kmap_local_page(bv.bv_page);
2045                 if (likely(dio->op == REQ_OP_WRITE))
2046                         flush_dcache_page(bv.bv_page);
2047
2048                 do {
2049                         struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
2050
2051                         if (unlikely(dio->op == REQ_OP_READ)) {
2052                                 struct journal_sector *js;
2053                                 char *mem_ptr;
2054                                 unsigned int s;
2055
2056                                 if (unlikely(journal_entry_is_inprogress(je))) {
2057                                         flush_dcache_page(bv.bv_page);
2058                                         kunmap_local(mem);
2059
2060                                         __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
2061                                         goto retry_kmap;
2062                                 }
2063                                 smp_rmb();
2064                                 BUG_ON(journal_entry_get_sector(je) != logical_sector);
2065                                 js = access_journal_data(ic, journal_section, journal_entry);
2066                                 mem_ptr = mem + bv.bv_offset;
2067                                 s = 0;
2068                                 do {
2069                                         memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
2070                                         *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
2071                                         js++;
2072                                         mem_ptr += 1 << SECTOR_SHIFT;
2073                                 } while (++s < ic->sectors_per_block);
2074 #ifdef INTERNAL_VERIFY
2075                                 if (ic->internal_hash) {
2076                                         char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
2077
2078                                         integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
2079                                         if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
2080                                                 DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
2081                                                             logical_sector);
2082                                                 dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
2083                                                                  bio, logical_sector, 0);
2084                                         }
2085                                 }
2086 #endif
2087                         }
2088
2089                         if (!ic->internal_hash) {
2090                                 struct bio_integrity_payload *bip = bio_integrity(bio);
2091                                 unsigned int tag_todo = ic->tag_size;
2092                                 char *tag_ptr = journal_entry_tag(ic, je);
2093
2094                                 if (bip) {
2095                                         do {
2096                                                 struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
2097                                                 unsigned int tag_now = min(biv.bv_len, tag_todo);
2098                                                 char *tag_addr;
2099
2100                                                 BUG_ON(PageHighMem(biv.bv_page));
2101                                                 tag_addr = bvec_virt(&biv);
2102                                                 if (likely(dio->op == REQ_OP_WRITE))
2103                                                         memcpy(tag_ptr, tag_addr, tag_now);
2104                                                 else
2105                                                         memcpy(tag_addr, tag_ptr, tag_now);
2106                                                 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
2107                                                 tag_ptr += tag_now;
2108                                                 tag_todo -= tag_now;
2109                                         } while (unlikely(tag_todo));
2110                                 } else if (likely(dio->op == REQ_OP_WRITE))
2111                                         memset(tag_ptr, 0, tag_todo);
2112                         }
2113
2114                         if (likely(dio->op == REQ_OP_WRITE)) {
2115                                 struct journal_sector *js;
2116                                 unsigned int s;
2117
2118                                 js = access_journal_data(ic, journal_section, journal_entry);
2119                                 memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
2120
2121                                 s = 0;
2122                                 do {
2123                                         je->last_bytes[s] = js[s].commit_id;
2124                                 } while (++s < ic->sectors_per_block);
2125
2126                                 if (ic->internal_hash) {
2127                                         unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
2128
2129                                         if (unlikely(digest_size > ic->tag_size)) {
2130                                                 char checksums_onstack[HASH_MAX_DIGESTSIZE];
2131
2132                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
2133                                                 memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
2134                                         } else
2135                                                 integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
2136                                 }
2137
2138                                 journal_entry_set_sector(je, logical_sector);
2139                         }
2140                         logical_sector += ic->sectors_per_block;
2141
2142                         journal_entry++;
2143                         if (unlikely(journal_entry == ic->journal_section_entries)) {
2144                                 journal_entry = 0;
2145                                 journal_section++;
2146                                 wraparound_section(ic, &journal_section);
2147                         }
2148
2149                         bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
2150                 } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
2151
2152                 if (unlikely(dio->op == REQ_OP_READ))
2153                         flush_dcache_page(bv.bv_page);
2154                 kunmap_local(mem);
2155         } while (n_sectors);
2156
2157         if (likely(dio->op == REQ_OP_WRITE)) {
2158                 smp_mb();
2159                 if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
2160                         wake_up(&ic->copy_to_journal_wait);
2161                 if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
2162                         queue_work(ic->commit_wq, &ic->commit_work);
2163                 else
2164                         schedule_autocommit(ic);
2165         } else
2166                 remove_range(ic, &dio->range);
2167
2168         if (unlikely(bio->bi_iter.bi_size)) {
2169                 sector_t area, offset;
2170
2171                 dio->range.logical_sector = logical_sector;
2172                 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
2173                 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
2174                 return true;
2175         }
2176
2177         return false;
2178 }
2179
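     /*
      * Main I/O path for the non-inline modes: in 'J' mode, writes are
      * allocated journal entries and reads are looked up in the journal; the
      * affected sector range is locked, and the I/O is either served through
      * the journal or submitted to the data device with tag processing done
      * by integrity_metadata().  Work that might sleep is offloaded to a
      * workqueue when this is called directly from the map function.
      */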
2180 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
2181 {
2182         struct dm_integrity_c *ic = dio->ic;
2183         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
2184         unsigned int journal_section, journal_entry;
2185         unsigned int journal_read_pos;
2186         sector_t recalc_sector;
2187         struct completion read_comp;
2188         bool discard_retried = false;
2189         bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
2190
2191         if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
2192                 need_sync_io = true;
2193
2194         if (need_sync_io && from_map) {
2195                 INIT_WORK(&dio->work, integrity_bio_wait);
2196                 queue_work(ic->offload_wq, &dio->work);
2197                 return;
2198         }
2199
2200 lock_retry:
2201         spin_lock_irq(&ic->endio_wait.lock);
2202 retry:
2203         if (unlikely(dm_integrity_failed(ic))) {
2204                 spin_unlock_irq(&ic->endio_wait.lock);
2205                 do_endio(ic, bio);
2206                 return;
2207         }
2208         dio->range.n_sectors = bio_sectors(bio);
2209         journal_read_pos = NOT_FOUND;
2210         if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
2211                 if (dio->op == REQ_OP_WRITE) {
2212                         unsigned int next_entry, i, pos;
2213                         unsigned int ws, we, range_sectors;
2214
2215                         dio->range.n_sectors = min(dio->range.n_sectors,
2216                                                    (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
2217                         if (unlikely(!dio->range.n_sectors)) {
2218                                 if (from_map)
2219                                         goto offload_to_thread;
2220                                 sleep_on_endio_wait(ic);
2221                                 goto retry;
2222                         }
2223                         range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
2224                         ic->free_sectors -= range_sectors;
2225                         journal_section = ic->free_section;
2226                         journal_entry = ic->free_section_entry;
2227
2228                         next_entry = ic->free_section_entry + range_sectors;
2229                         ic->free_section_entry = next_entry % ic->journal_section_entries;
2230                         ic->free_section += next_entry / ic->journal_section_entries;
2231                         ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
2232                         wraparound_section(ic, &ic->free_section);
2233
2234                         pos = journal_section * ic->journal_section_entries + journal_entry;
2235                         ws = journal_section;
2236                         we = journal_entry;
2237                         i = 0;
2238                         do {
2239                                 struct journal_entry *je;
2240
2241                                 add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
2242                                 pos++;
2243                                 if (unlikely(pos >= ic->journal_entries))
2244                                         pos = 0;
2245
2246                                 je = access_journal_entry(ic, ws, we);
2247                                 BUG_ON(!journal_entry_is_unused(je));
2248                                 journal_entry_set_inprogress(je);
2249                                 we++;
2250                                 if (unlikely(we == ic->journal_section_entries)) {
2251                                         we = 0;
2252                                         ws++;
2253                                         wraparound_section(ic, &ws);
2254                                 }
2255                         } while ((i += ic->sectors_per_block) < dio->range.n_sectors);
2256
2257                         spin_unlock_irq(&ic->endio_wait.lock);
2258                         goto journal_read_write;
2259                 } else {
2260                         sector_t next_sector;
2261
2262                         journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
2263                         if (likely(journal_read_pos == NOT_FOUND)) {
2264                                 if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
2265                                         dio->range.n_sectors = next_sector - dio->range.logical_sector;
2266                         } else {
2267                                 unsigned int i;
2268                                 unsigned int jp = journal_read_pos + 1;
2269
2270                                 for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
2271                                         if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
2272                                                 break;
2273                                 }
2274                                 dio->range.n_sectors = i;
2275                         }
2276                 }
2277         }
2278         if (unlikely(!add_new_range(ic, &dio->range, true))) {
2279                 /*
2280                  * We must not sleep in the request routine because it could
2281                  * stall bios on current->bio_list.
2282                  * So, we offload the bio to a workqueue if we have to sleep.
2283                  */
2284                 if (from_map) {
2285 offload_to_thread:
2286                         spin_unlock_irq(&ic->endio_wait.lock);
2287                         INIT_WORK(&dio->work, integrity_bio_wait);
2288                         queue_work(ic->wait_wq, &dio->work);
2289                         return;
2290                 }
2291                 if (journal_read_pos != NOT_FOUND)
2292                         dio->range.n_sectors = ic->sectors_per_block;
2293                 wait_and_add_new_range(ic, &dio->range);
2294                 /*
2295                  * wait_and_add_new_range drops the spinlock, so the journal
2296                  * may have been changed arbitrarily. We need to recheck.
2297                  * To simplify the code, we restrict I/O size to just one block.
2298                  */
2299                 if (journal_read_pos != NOT_FOUND) {
2300                         sector_t next_sector;
2301                         unsigned int new_pos;
2302
2303                         new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
2304                         if (unlikely(new_pos != journal_read_pos)) {
2305                                 remove_range_unlocked(ic, &dio->range);
2306                                 goto retry;
2307                         }
2308                 }
2309         }
2310         if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) {
2311                 sector_t next_sector;
2312                 unsigned int new_pos;
2313
2314                 new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
2315                 if (unlikely(new_pos != NOT_FOUND) ||
2316                     unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) {
2317                         remove_range_unlocked(ic, &dio->range);
2318                         spin_unlock_irq(&ic->endio_wait.lock);
2319                         queue_work(ic->commit_wq, &ic->commit_work);
2320                         flush_workqueue(ic->commit_wq);
2321                         queue_work(ic->writer_wq, &ic->writer_work);
2322                         flush_workqueue(ic->writer_wq);
2323                         discard_retried = true;
2324                         goto lock_retry;
2325                 }
2326         }
2327         recalc_sector = le64_to_cpu(ic->sb->recalc_sector);
2328         spin_unlock_irq(&ic->endio_wait.lock);
2329
2330         if (unlikely(journal_read_pos != NOT_FOUND)) {
2331                 journal_section = journal_read_pos / ic->journal_section_entries;
2332                 journal_entry = journal_read_pos % ic->journal_section_entries;
2333                 goto journal_read_write;
2334         }
2335
2336         if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
2337                 if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
2338                                      dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
2339                         struct bitmap_block_status *bbs;
2340
2341                         bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
2342                         spin_lock(&bbs->bio_queue_lock);
2343                         bio_list_add(&bbs->bio_queue, bio);
2344                         spin_unlock(&bbs->bio_queue_lock);
2345                         queue_work(ic->writer_wq, &bbs->work);
2346                         return;
2347                 }
2348         }
2349
2350         dio->in_flight = (atomic_t)ATOMIC_INIT(2);
2351
2352         if (need_sync_io) {
2353                 init_completion(&read_comp);
2354                 dio->completion = &read_comp;
2355         } else
2356                 dio->completion = NULL;
2357
2358         dm_bio_record(&dio->bio_details, bio);
2359         bio_set_dev(bio, ic->dev->bdev);
2360         bio->bi_integrity = NULL;
2361         bio->bi_opf &= ~REQ_INTEGRITY;
2362         bio->bi_end_io = integrity_end_io;
2363         bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
2364
2365         if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
2366                 integrity_metadata(&dio->work);
2367                 dm_integrity_flush_buffers(ic, false);
2368
2369                 dio->in_flight = (atomic_t)ATOMIC_INIT(1);
2370                 dio->completion = NULL;
2371
2372                 submit_bio_noacct(bio);
2373
2374                 return;
2375         }
2376
2377         submit_bio_noacct(bio);
2378
2379         if (need_sync_io) {
2380                 wait_for_completion_io(&read_comp);
2381                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
2382                     dio->range.logical_sector + dio->range.n_sectors > recalc_sector)
2383                         goto skip_check;
2384                 if (ic->mode == 'B') {
2385                         if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
2386                                              dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
2387                                 goto skip_check;
2388                 }
2389
2390                 if (likely(!bio->bi_status))
2391                         integrity_metadata(&dio->work);
2392                 else
2393 skip_check:
2394                         dec_in_flight(dio);
2395         } else {
2396                 INIT_WORK(&dio->work, integrity_metadata);
2397                 queue_work(ic->metadata_wq, &dio->work);
2398         }
2399
2400         return;
2401
2402 journal_read_write:
2403         if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
2404                 goto lock_retry;
2405
2406         do_endio_flush(ic, dio);
2407 }
2408
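     /*
      * I/O path for the inline ('I') mode: per-block checksums are attached
      * to the bio as a bio integrity payload, and the bio is remapped past
      * the superblock (ic->start + SB_SECTORS) and submitted to the data
      * device; read verification happens later in dm_integrity_end_io().
      */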
2409 static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map)
2410 {
2411         struct dm_integrity_c *ic = dio->ic;
2412         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
2413         struct bio_integrity_payload *bip;
2414         unsigned ret;
2415         sector_t recalc_sector;
2416
2417         if (unlikely(bio_integrity(bio))) {
2418                 bio->bi_status = BLK_STS_NOTSUPP;
2419                 bio_endio(bio);
2420                 return DM_MAPIO_SUBMITTED;
2421         }
2422
2423         bio_set_dev(bio, ic->dev->bdev);
2424         if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0))
2425                 return DM_MAPIO_REMAPPED;
2426
2427 retry:
2428         if (!dio->integrity_payload) {
2429                 unsigned digest_size, extra_size;
2430                 dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
2431                 digest_size = crypto_shash_digestsize(ic->internal_hash);
2432                 extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
2433                 dio->payload_len += extra_size;
2434                 dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
2435                 if (unlikely(!dio->integrity_payload)) {
2436                         const unsigned x_size = PAGE_SIZE << 1;
2437                         if (dio->payload_len > x_size) {
2438                                 unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block;
2439                                 if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) {
2440                                         bio->bi_status = BLK_STS_NOTSUPP;
2441                                         bio_endio(bio);
2442                                         return DM_MAPIO_SUBMITTED;
2443                                 }
2444                                 dm_accept_partial_bio(bio, sectors);
2445                                 goto retry;
2446                         }
2447                 }
2448         }
2449
2450         dio->range.logical_sector = bio->bi_iter.bi_sector;
2451         dio->range.n_sectors = bio_sectors(bio);
2452
2453         if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)))
2454                 goto skip_spinlock;
2455 #ifdef CONFIG_64BIT
2456         /*
2457          * On 64-bit CPUs we can optimize the lock away (so that it won't cause
2458          * cache line bouncing) and use acquire/release barriers instead.
2459          *
2460          * Paired with smp_store_release in integrity_recalc_inline.
2461          */
2462         recalc_sector = le64_to_cpu(smp_load_acquire(&ic->sb->recalc_sector));
2463         if (likely(dio->range.logical_sector + dio->range.n_sectors <= recalc_sector))
2464                 goto skip_spinlock;
2465 #endif
2466         spin_lock_irq(&ic->endio_wait.lock);
2467         recalc_sector = le64_to_cpu(ic->sb->recalc_sector);
2468         if (dio->range.logical_sector + dio->range.n_sectors <= recalc_sector)
2469                 goto skip_unlock;
2470         if (unlikely(!add_new_range(ic, &dio->range, true))) {
2471                 if (from_map) {
2472                         spin_unlock_irq(&ic->endio_wait.lock);
2473                         INIT_WORK(&dio->work, integrity_bio_wait);
2474                         queue_work(ic->wait_wq, &dio->work);
2475                         return DM_MAPIO_SUBMITTED;
2476                 }
2477                 wait_and_add_new_range(ic, &dio->range);
2478         }
2479         dio->integrity_range_locked = true;
2480 skip_unlock:
2481         spin_unlock_irq(&ic->endio_wait.lock);
2482 skip_spinlock:
2483
2484         if (unlikely(!dio->integrity_payload)) {
2485                 dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
2486                 dio->integrity_payload_from_mempool = true;
2487         }
2488
2489         dio->bio_details.bi_iter = bio->bi_iter;
2490
2491         if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) {
2492                 return DM_MAPIO_KILL;
2493         }
2494
2495         bio->bi_iter.bi_sector += ic->start + SB_SECTORS;
2496
2497         bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
2498         if (IS_ERR(bip)) {
2499                 bio->bi_status = errno_to_blk_status(PTR_ERR(bip));
2500                 bio_endio(bio);
2501                 return DM_MAPIO_SUBMITTED;
2502         }
2503
2504         if (dio->op == REQ_OP_WRITE) {
2505                 unsigned pos = 0;
2506                 while (dio->bio_details.bi_iter.bi_size) {
2507                         struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
2508                         const char *mem = bvec_kmap_local(&bv);
2509                         if (ic->tag_size < ic->tuple_size)
2510                                 memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tag_size);
2511                         integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos);
2512                         kunmap_local(mem);
2513                         pos += ic->tuple_size;
2514                         bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
2515                 }
2516         }
2517
2518         ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload),
2519                                         dio->payload_len, offset_in_page(dio->integrity_payload));
2520         if (unlikely(ret != dio->payload_len)) {
2521                 bio->bi_status = BLK_STS_RESOURCE;
2522                 bio_endio(bio);
2523                 return DM_MAPIO_SUBMITTED;
2524         }
2525
2526         return DM_MAPIO_REMAPPED;
2527 }
2528
2529 static inline void dm_integrity_free_payload(struct dm_integrity_io *dio)
2530 {
2531         struct dm_integrity_c *ic = dio->ic;
2532         if (unlikely(dio->integrity_payload_from_mempool))
2533                 mempool_free(virt_to_page(dio->integrity_payload), &ic->recheck_pool);
2534         else
2535                 kfree(dio->integrity_payload);
2536         dio->integrity_payload = NULL;
2537         dio->integrity_payload_from_mempool = false;
2538 }
2539
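     /*
      * Inline-mode slow path: when verification in dm_integrity_end_io()
      * fails, re-read each block together with its integrity tuple through a
      * separate synchronous bio and compare the checksum again; a persistent
      * mismatch terminates the bio with BLK_STS_PROTECTION.
      */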
2540 static void dm_integrity_inline_recheck(struct work_struct *w)
2541 {
2542         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
2543         struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
2544         struct dm_integrity_c *ic = dio->ic;
2545         struct bio *outgoing_bio;
2546         void *outgoing_data;
2547
2548         dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
2549         dio->integrity_payload_from_mempool = true;
2550
2551         outgoing_data = dio->integrity_payload + PAGE_SIZE;
2552
2553         while (dio->bio_details.bi_iter.bi_size) {
2554                 char digest[HASH_MAX_DIGESTSIZE];
2555                 int r;
2556                 struct bio_integrity_payload *bip;
2557                 struct bio_vec bv;
2558                 char *mem;
2559
2560                 outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
2561
2562                 r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
2563                 if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
2564                         bio_put(outgoing_bio);
2565                         bio->bi_status = BLK_STS_RESOURCE;
2566                         bio_endio(bio);
2567                         return;
2568                 }
2569
2570                 bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
2571                 if (IS_ERR(bip)) {
2572                         bio_put(outgoing_bio);
2573                         bio->bi_status = errno_to_blk_status(PTR_ERR(bip));
2574                         bio_endio(bio);
2575                         return;
2576                 }
2577
2578                 r = bio_integrity_add_page(outgoing_bio, virt_to_page(dio->integrity_payload), ic->tuple_size, 0);
2579                 if (unlikely(r != ic->tuple_size)) {
2580                         bio_put(outgoing_bio);
2581                         bio->bi_status = BLK_STS_RESOURCE;
2582                         bio_endio(bio);
2583                         return;
2584                 }
2585
2586                 outgoing_bio->bi_iter.bi_sector = dio->bio_details.bi_iter.bi_sector + ic->start + SB_SECTORS;
2587
2588                 r = submit_bio_wait(outgoing_bio);
2589                 if (unlikely(r != 0)) {
2590                         bio_put(outgoing_bio);
2591                         bio->bi_status = errno_to_blk_status(r);
2592                         bio_endio(bio);
2593                         return;
2594                 }
2595                 bio_put(outgoing_bio);
2596
2597                 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
2598                 if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
2599                         DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
2600                                 ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
2601                         atomic64_inc(&ic->number_of_mismatches);
2602                         dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
2603                                 bio, dio->bio_details.bi_iter.bi_sector, 0);
2604
2605                         bio->bi_status = BLK_STS_PROTECTION;
2606                         bio_endio(bio);
2607                         return;
2608                 }
2609
2610                 bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
2611                 mem = bvec_kmap_local(&bv);
2612                 memcpy(mem, outgoing_data, ic->sectors_per_block << SECTOR_SHIFT);
2613                 kunmap_local(mem);
2614
2615                 bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
2616         }
2617
2618         bio_endio(bio);
2619 }
2620
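/*
 * End-io hook; only inline mode ('I') does work here. On a successfully
 * completed read, verify each data block against the tuples returned in the
 * bio integrity payload; on a mismatch, defer to dm_integrity_inline_recheck()
 * and return DM_ENDIO_INCOMPLETE so that the bio is not completed yet.
 */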
2621 static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
2622 {
2623         struct dm_integrity_c *ic = ti->private;
2624         if (ic->mode == 'I') {
2625                 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
2626                 if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) {
2627                         unsigned pos = 0;
2628                         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
2629                             unlikely(dio->integrity_range_locked))
2630                                 goto skip_check;
2631                         while (dio->bio_details.bi_iter.bi_size) {
2632                                 char digest[HASH_MAX_DIGESTSIZE];
2633                                 struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
2634                                 char *mem = bvec_kmap_local(&bv);
2635                                 //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
2636                                 integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
2637                                 if (unlikely(memcmp(digest, dio->integrity_payload + pos,
2638                                                 min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
2639                                         kunmap_local(mem);
2640                                         dm_integrity_free_payload(dio);
2641                                         INIT_WORK(&dio->work, dm_integrity_inline_recheck);
2642                                         queue_work(ic->offload_wq, &dio->work);
2643                                         return DM_ENDIO_INCOMPLETE;
2644                                 }
2645                                 kunmap_local(mem);
2646                                 pos += ic->tuple_size;
2647                                 bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
2648                         }
2649                 }
2650 skip_check:
2651                 dm_integrity_free_payload(dio);
2652                 if (unlikely(dio->integrity_range_locked))
2653                         remove_range(ic, &dio->range);
2654         }
2655         return DM_ENDIO_DONE;
2656 }
2657
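/*
 * Workqueue handler for bios whose mapping had to be deferred: restart the
 * mapping through the inline path or through dm_integrity_map_continue().
 */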
2658 static void integrity_bio_wait(struct work_struct *w)
2659 {
2660         struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
2661         struct dm_integrity_c *ic = dio->ic;
2662
2663         if (ic->mode == 'I') {
2664                 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
2665                 int r = dm_integrity_map_inline(dio, false);
2666                 switch (r) {
2667                         case DM_MAPIO_KILL:
2668                                 bio->bi_status = BLK_STS_IOERR;
2669                                 fallthrough;
2670                         case DM_MAPIO_REMAPPED:
2671                                 submit_bio_noacct(bio);
2672                                 fallthrough;
2673                         case DM_MAPIO_SUBMITTED:
2674                                 return;
2675                         default:
2676                                 BUG();
2677                 }
2678         } else {
2679                 dm_integrity_map_continue(dio, false);
2680         }
2681 }
2682
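/*
 * If the current journal section is only partially filled, skip its unused
 * entries so that commits always cover whole sections; the WARN_ON checks
 * that the free-sector accounting still adds up.
 */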
2683 static void pad_uncommitted(struct dm_integrity_c *ic)
2684 {
2685         if (ic->free_section_entry) {
2686                 ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
2687                 ic->free_section_entry = 0;
2688                 ic->free_section++;
2689                 wraparound_section(ic, &ic->free_section);
2690                 ic->n_uncommitted_sections++;
2691         }
2692         if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
2693                     (ic->n_uncommitted_sections + ic->n_committed_sections) *
2694                     ic->journal_section_entries + ic->free_sectors)) {
2695                 DMCRIT("journal_sections %u, journal_section_entries %u, "
2696                        "n_uncommitted_sections %u, n_committed_sections %u, "
2697                        "journal_section_entries %u, free_sectors %u",
2698                        ic->journal_sections, ic->journal_section_entries,
2699                        ic->n_uncommitted_sections, ic->n_committed_sections,
2700                        ic->journal_section_entries, ic->free_sectors);
2701         }
2702 }
2703
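/*
 * Commit work for journal mode: wait until all in-progress journal entries
 * in the uncommitted sections are filled in, stamp every journal sector with
 * the current commit id, write the sections out and finally complete any
 * queued flush bios.
 */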
2704 static void integrity_commit(struct work_struct *w)
2705 {
2706         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
2707         unsigned int commit_start, commit_sections;
2708         unsigned int i, j, n;
2709         struct bio *flushes;
2710
2711         del_timer(&ic->autocommit_timer);
2712
2713         if (ic->mode == 'I')
2714                 return;
2715
2716         spin_lock_irq(&ic->endio_wait.lock);
2717         flushes = bio_list_get(&ic->flush_bio_list);
2718         if (unlikely(ic->mode != 'J')) {
2719                 spin_unlock_irq(&ic->endio_wait.lock);
2720                 dm_integrity_flush_buffers(ic, true);
2721                 goto release_flush_bios;
2722         }
2723
2724         pad_uncommitted(ic);
2725         commit_start = ic->uncommitted_section;
2726         commit_sections = ic->n_uncommitted_sections;
2727         spin_unlock_irq(&ic->endio_wait.lock);
2728
2729         if (!commit_sections)
2730                 goto release_flush_bios;
2731
2732         ic->wrote_to_journal = true;
2733
2734         i = commit_start;
2735         for (n = 0; n < commit_sections; n++) {
2736                 for (j = 0; j < ic->journal_section_entries; j++) {
2737                         struct journal_entry *je;
2738
2739                         je = access_journal_entry(ic, i, j);
2740                         io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
2741                 }
2742                 for (j = 0; j < ic->journal_section_sectors; j++) {
2743                         struct journal_sector *js;
2744
2745                         js = access_journal(ic, i, j);
2746                         js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
2747                 }
2748                 i++;
2749                 if (unlikely(i >= ic->journal_sections))
2750                         ic->commit_seq = next_commit_seq(ic->commit_seq);
2751                 wraparound_section(ic, &i);
2752         }
2753         smp_rmb();
2754
2755         write_journal(ic, commit_start, commit_sections);
2756
2757         spin_lock_irq(&ic->endio_wait.lock);
2758         ic->uncommitted_section += commit_sections;
2759         wraparound_section(ic, &ic->uncommitted_section);
2760         ic->n_uncommitted_sections -= commit_sections;
2761         ic->n_committed_sections += commit_sections;
2762         spin_unlock_irq(&ic->endio_wait.lock);
2763
2764         if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
2765                 queue_work(ic->writer_wq, &ic->writer_work);
2766
2767 release_flush_bios:
2768         while (flushes) {
2769                 struct bio *next = flushes->bi_next;
2770
2771                 flushes->bi_next = NULL;
2772                 do_endio(ic, flushes);
2773                 flushes = next;
2774         }
2775 }
2776
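/*
 * Completion callback for copy_from_journal(): release the range lock,
 * return the journal_io structure to its mempool and report completion.
 */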
2777 static void complete_copy_from_journal(unsigned long error, void *context)
2778 {
2779         struct journal_io *io = context;
2780         struct journal_completion *comp = io->comp;
2781         struct dm_integrity_c *ic = comp->ic;
2782
2783         remove_range(ic, &io->range);
2784         mempool_free(io, &ic->journal_io_mempool);
2785         if (unlikely(error != 0))
2786                 dm_integrity_io_error(ic, "copying from journal", -EIO);
2787         complete_journal_op(comp);
2788 }
2789
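/*
 * The last 8 bytes of each data sector stored in the journal are overwritten
 * by the commit id; the original bytes are kept in je->last_bytes[] and are
 * restored here before the data is written to its final location.
 */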
2790 static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
2791                                struct journal_entry *je)
2792 {
2793         unsigned int s = 0;
2794
2795         do {
2796                 js->commit_id = je->last_bytes[s];
2797                 js++;
2798         } while (++s < ic->sectors_per_block);
2799 }
2800
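/*
 * Write committed journal sections back to their final location on the data
 * device. Consecutive entries that form a contiguous range are copied in one
 * operation, entries superseded by newer committed data are skipped, and the
 * corresponding tags are written into the metadata area. When called from
 * journal replay, the section MACs and data checksums are verified as well.
 */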
2801 static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start,
2802                              unsigned int write_sections, bool from_replay)
2803 {
2804         unsigned int i, j, n;
2805         struct journal_completion comp;
2806         struct blk_plug plug;
2807
2808         blk_start_plug(&plug);
2809
2810         comp.ic = ic;
2811         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2812         init_completion(&comp.comp);
2813
2814         i = write_start;
2815         for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
2816 #ifndef INTERNAL_VERIFY
2817                 if (unlikely(from_replay))
2818 #endif
2819                         rw_section_mac(ic, i, false);
2820                 for (j = 0; j < ic->journal_section_entries; j++) {
2821                         struct journal_entry *je = access_journal_entry(ic, i, j);
2822                         sector_t sec, area, offset;
2823                         unsigned int k, l, next_loop;
2824                         sector_t metadata_block;
2825                         unsigned int metadata_offset;
2826                         struct journal_io *io;
2827
2828                         if (journal_entry_is_unused(je))
2829                                 continue;
2830                         BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
2831                         sec = journal_entry_get_sector(je);
2832                         if (unlikely(from_replay)) {
2833                                 if (unlikely(sec & (unsigned int)(ic->sectors_per_block - 1))) {
2834                                         dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
2835                                         sec &= ~(sector_t)(ic->sectors_per_block - 1);
2836                                 }
2837                                 if (unlikely(sec >= ic->provided_data_sectors)) {
2838                                         journal_entry_set_unused(je);
2839                                         continue;
2840                                 }
2841                         }
2842                         get_area_and_offset(ic, sec, &area, &offset);
2843                         restore_last_bytes(ic, access_journal_data(ic, i, j), je);
2844                         for (k = j + 1; k < ic->journal_section_entries; k++) {
2845                                 struct journal_entry *je2 = access_journal_entry(ic, i, k);
2846                                 sector_t sec2, area2, offset2;
2847
2848                                 if (journal_entry_is_unused(je2))
2849                                         break;
2850                                 BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
2851                                 sec2 = journal_entry_get_sector(je2);
2852                                 if (unlikely(sec2 >= ic->provided_data_sectors))
2853                                         break;
2854                                 get_area_and_offset(ic, sec2, &area2, &offset2);
2855                                 if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
2856                                         break;
2857                                 restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
2858                         }
2859                         next_loop = k - 1;
2860
2861                         io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
2862                         io->comp = &comp;
2863                         io->range.logical_sector = sec;
2864                         io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
2865
2866                         spin_lock_irq(&ic->endio_wait.lock);
2867                         add_new_range_and_wait(ic, &io->range);
2868
2869                         if (likely(!from_replay)) {
2870                                 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
2871
2872                                 /* don't write if there is a newer committed sector */
2873                                 while (j < k && find_newer_committed_node(ic, &section_node[j])) {
2874                                         struct journal_entry *je2 = access_journal_entry(ic, i, j);
2875
2876                                         journal_entry_set_unused(je2);
2877                                         remove_journal_node(ic, &section_node[j]);
2878                                         j++;
2879                                         sec += ic->sectors_per_block;
2880                                         offset += ic->sectors_per_block;
2881                                 }
2882                                 while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
2883                                         struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
2884
2885                                         journal_entry_set_unused(je2);
2886                                         remove_journal_node(ic, &section_node[k - 1]);
2887                                         k--;
2888                                 }
2889                                 if (j == k) {
2890                                         remove_range_unlocked(ic, &io->range);
2891                                         spin_unlock_irq(&ic->endio_wait.lock);
2892                                         mempool_free(io, &ic->journal_io_mempool);
2893                                         goto skip_io;
2894                                 }
2895                                 for (l = j; l < k; l++)
2896                                         remove_journal_node(ic, &section_node[l]);
2897                         }
2898                         spin_unlock_irq(&ic->endio_wait.lock);
2899
2900                         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
2901                         for (l = j; l < k; l++) {
2902                                 int r;
2903                                 struct journal_entry *je2 = access_journal_entry(ic, i, l);
2904
2905                                 if (
2906 #ifndef INTERNAL_VERIFY
2907                                     unlikely(from_replay) &&
2908 #endif
2909                                     ic->internal_hash) {
2910                                         char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
2911
2912                                         integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
2913                                                                   (char *)access_journal_data(ic, i, l), test_tag);
2914                                         if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
2915                                                 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
2916                                                 dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
2917                                         }
2918                                 }
2919
2920                                 journal_entry_set_unused(je2);
2921                                 r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
2922                                                         ic->tag_size, TAG_WRITE);
2923                                 if (unlikely(r))
2924                                         dm_integrity_io_error(ic, "writing tags", r);
2925                         }
2926
2927                         atomic_inc(&comp.in_flight);
2928                         copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
2929                                           (k - j) << ic->sb->log2_sectors_per_block,
2930                                           get_data_sector(ic, area, offset),
2931                                           complete_copy_from_journal, io);
2932 skip_io:
2933                         j = next_loop;
2934                 }
2935         }
2936
2937         dm_bufio_write_dirty_buffers_async(ic->bufio);
2938
2939         blk_finish_plug(&plug);
2940
2941         complete_journal_op(&comp);
2942         wait_for_completion_io(&comp.comp);
2943
2944         dm_integrity_flush_buffers(ic, true);
2945 }
2946
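/*
 * Writer work: flush the committed journal sections to disk via
 * do_journal_write() and return the freed entries to the journal free pool,
 * waking up any writers that were waiting for journal space.
 */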
2947 static void integrity_writer(struct work_struct *w)
2948 {
2949         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
2950         unsigned int write_start, write_sections;
2951         unsigned int prev_free_sectors;
2952
2953         spin_lock_irq(&ic->endio_wait.lock);
2954         write_start = ic->committed_section;
2955         write_sections = ic->n_committed_sections;
2956         spin_unlock_irq(&ic->endio_wait.lock);
2957
2958         if (!write_sections)
2959                 return;
2960
2961         do_journal_write(ic, write_start, write_sections, false);
2962
2963         spin_lock_irq(&ic->endio_wait.lock);
2964
2965         ic->committed_section += write_sections;
2966         wraparound_section(ic, &ic->committed_section);
2967         ic->n_committed_sections -= write_sections;
2968
2969         prev_free_sectors = ic->free_sectors;
2970         ic->free_sectors += write_sections * ic->journal_section_entries;
2971         if (unlikely(!prev_free_sectors))
2972                 wake_up_locked(&ic->endio_wait);
2973
2974         spin_unlock_irq(&ic->endio_wait.lock);
2975 }
2976
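/*
 * Flush outstanding buffers and persist the recalculation progress stored in
 * the superblock (sb->recalc_sector).
 */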
2977 static void recalc_write_super(struct dm_integrity_c *ic)
2978 {
2979         int r;
2980
2981         dm_integrity_flush_buffers(ic, false);
2982         if (dm_integrity_failed(ic))
2983                 return;
2984
2985         r = sync_rw_sb(ic, REQ_OP_WRITE);
2986         if (unlikely(r))
2987                 dm_integrity_io_error(ic, "writing superblock", r);
2988 }
2989
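/*
 * Background recalculation of integrity tags for non-inline modes: read data
 * in chunks of up to RECALC_SECTORS, compute the checksum of every block,
 * write the tags to the metadata area and periodically persist the progress
 * in the superblock. The chunk size is halved on allocation failure.
 */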
2990 static void integrity_recalc(struct work_struct *w)
2991 {
2992         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
2993         size_t recalc_tags_size;
2994         u8 *recalc_buffer = NULL;
2995         u8 *recalc_tags = NULL;
2996         struct dm_integrity_range range;
2997         struct dm_io_request io_req;
2998         struct dm_io_region io_loc;
2999         sector_t area, offset;
3000         sector_t metadata_block;
3001         unsigned int metadata_offset;
3002         sector_t logical_sector, n_sectors;
3003         __u8 *t;
3004         unsigned int i;
3005         int r;
3006         unsigned int super_counter = 0;
3007         unsigned recalc_sectors = RECALC_SECTORS;
3008
3009 retry:
3010         recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
3011         if (!recalc_buffer) {
3012 oom:
3013                 recalc_sectors >>= 1;
3014                 if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
3015                         goto retry;
3016                 DMCRIT("out of memory for recalculate buffer - recalculation disabled");
3017                 goto free_ret;
3018         }
3019         recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
3020         if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
3021                 recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
3022         recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
3023         if (!recalc_tags) {
3024                 vfree(recalc_buffer);
3025                 recalc_buffer = NULL;
3026                 goto oom;
3027         }
3028
3029         DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
3030
3031         spin_lock_irq(&ic->endio_wait.lock);
3032
3033 next_chunk:
3034
3035         if (unlikely(dm_post_suspending(ic->ti)))
3036                 goto unlock_ret;
3037
3038         range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
3039         if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
3040                 if (ic->mode == 'B') {
3041                         block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
3042                         DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
3043                         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
3044                 }
3045                 goto unlock_ret;
3046         }
3047
3048         get_area_and_offset(ic, range.logical_sector, &area, &offset);
3049         range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
3050         if (!ic->meta_dev)
3051                 range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset);
3052
3053         add_new_range_and_wait(ic, &range);
3054         spin_unlock_irq(&ic->endio_wait.lock);
3055         logical_sector = range.logical_sector;
3056         n_sectors = range.n_sectors;
3057
3058         if (ic->mode == 'B') {
3059                 if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
3060                         goto advance_and_next;
3061
3062                 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
3063                                        ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
3064                         logical_sector += ic->sectors_per_block;
3065                         n_sectors -= ic->sectors_per_block;
3066                         cond_resched();
3067                 }
3068                 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
3069                                        ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
3070                         n_sectors -= ic->sectors_per_block;
3071                         cond_resched();
3072                 }
3073                 get_area_and_offset(ic, logical_sector, &area, &offset);
3074         }
3075
3076         DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors);
3077
3078         if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
3079                 recalc_write_super(ic);
3080                 if (ic->mode == 'B')
3081                         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
3082
3083                 super_counter = 0;
3084         }
3085
3086         if (unlikely(dm_integrity_failed(ic)))
3087                 goto err;
3088
3089         io_req.bi_opf = REQ_OP_READ;
3090         io_req.mem.type = DM_IO_VMA;
3091         io_req.mem.ptr.addr = recalc_buffer;
3092         io_req.notify.fn = NULL;
3093         io_req.client = ic->io;
3094         io_loc.bdev = ic->dev->bdev;
3095         io_loc.sector = get_data_sector(ic, area, offset);
3096         io_loc.count = n_sectors;
3097
3098         r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
3099         if (unlikely(r)) {
3100                 dm_integrity_io_error(ic, "reading data", r);
3101                 goto err;
3102         }
3103
3104         t = recalc_tags;
3105         for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
3106                 integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
3107                 t += ic->tag_size;
3108         }
3109
3110         metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
3111
3112         r = dm_integrity_rw_tag(ic, recalc_tags, &metadata_block, &metadata_offset, t - recalc_tags, TAG_WRITE);
3113         if (unlikely(r)) {
3114                 dm_integrity_io_error(ic, "writing tags", r);
3115                 goto err;
3116         }
3117
3118         if (ic->mode == 'B') {
3119                 sector_t start, end;
3120
3121                 start = (range.logical_sector >>
3122                          (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
3123                         (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
3124                 end = ((range.logical_sector + range.n_sectors) >>
3125                        (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
3126                         (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
3127                 block_bitmap_op(ic, ic->recalc_bitmap, start, end - start, BITMAP_OP_CLEAR);
3128         }
3129
3130 advance_and_next:
3131         cond_resched();
3132
3133         spin_lock_irq(&ic->endio_wait.lock);
3134         remove_range_unlocked(ic, &range);
3135         ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
3136         goto next_chunk;
3137
3138 err:
3139         remove_range(ic, &range);
3140         goto free_ret;
3141
3142 unlock_ret:
3143         spin_unlock_irq(&ic->endio_wait.lock);
3144
3145         recalc_write_super(ic);
3146
3147 free_ret:
3148         vfree(recalc_buffer);
3149         kvfree(recalc_tags);
3150 }
3151
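/*
 * Recalculation for inline mode ('I'): the data is re-read and then rewritten
 * through regular bios carrying a bio_integrity_payload, so the freshly
 * computed tuples are stored by the lower device together with the data.
 */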
3152 static void integrity_recalc_inline(struct work_struct *w)
3153 {
3154         struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
3155         size_t recalc_tags_size;
3156         u8 *recalc_buffer = NULL;
3157         u8 *recalc_tags = NULL;
3158         struct dm_integrity_range range;
3159         struct bio *bio;
3160         struct bio_integrity_payload *bip;
3161         __u8 *t;
3162         unsigned int i;
3163         int r;
3164         unsigned ret;
3165         unsigned int super_counter = 0;
3166         unsigned recalc_sectors = RECALC_SECTORS;
3167
3168 retry:
3169         recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN);
3170         if (!recalc_buffer) {
3171 oom:
3172                 recalc_sectors >>= 1;
3173                 if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
3174                         goto retry;
3175                 DMCRIT("out of memory for recalculate buffer - recalculation disabled");
3176                 goto free_ret;
3177         }
3178
3179         recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
3180         if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
3181                 recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
3182         recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
3183         if (!recalc_tags) {
3184                 kfree(recalc_buffer);
3185                 recalc_buffer = NULL;
3186                 goto oom;
3187         }
3188
3189         spin_lock_irq(&ic->endio_wait.lock);
3190
3191 next_chunk:
3192         if (unlikely(dm_post_suspending(ic->ti)))
3193                 goto unlock_ret;
3194
3195         range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
3196         if (unlikely(range.logical_sector >= ic->provided_data_sectors))
3197                 goto unlock_ret;
3198         range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
3199
3200         add_new_range_and_wait(ic, &range);
3201         spin_unlock_irq(&ic->endio_wait.lock);
3202
3203         if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
3204                 recalc_write_super(ic);
3205                 super_counter = 0;
3206         }
3207
3208         if (unlikely(dm_integrity_failed(ic)))
3209                 goto err;
3210
3211         DEBUG_print("recalculating: %llx - %llx\n", range.logical_sector, range.n_sectors);
3212
3213         bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
3214         bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
3215         __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
3216         r = submit_bio_wait(bio);
3217         bio_put(bio);
3218         if (unlikely(r)) {
3219                 dm_integrity_io_error(ic, "reading data", r);
3220                 goto err;
3221         }
3222
3223         t = recalc_tags;
3224         for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
3225                 memset(t, 0, ic->tuple_size);
3226                 integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
3227                 t += ic->tuple_size;
3228         }
3229
3230         bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
3231         bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
3232         __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
3233
3234         bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
3235         if (unlikely(IS_ERR(bip))) {
3236                 bio_put(bio);
3237                 DMCRIT("out of memory for bio integrity payload - recalculation disabled");
3238                 goto err;
3239         }
3240         ret = bio_integrity_add_page(bio, virt_to_page(recalc_tags), t - recalc_tags, offset_in_page(recalc_tags));
3241         if (unlikely(ret != t - recalc_tags)) {
3242                 bio_put(bio);
3243                 dm_integrity_io_error(ic, "attaching integrity tags", -ENOMEM);
3244                 goto err;
3245         }
3246
3247         r = submit_bio_wait(bio);
3248         bio_put(bio);
3249         if (unlikely(r)) {
3250                 dm_integrity_io_error(ic, "writing data", r);
3251                 goto err;
3252         }
3253
3254         cond_resched();
3255         spin_lock_irq(&ic->endio_wait.lock);
3256         remove_range_unlocked(ic, &range);
3257 #ifdef CONFIG_64BIT
3258         /* Paired with smp_load_acquire in dm_integrity_map_inline. */
3259         smp_store_release(&ic->sb->recalc_sector, cpu_to_le64(range.logical_sector + range.n_sectors));
3260 #else
3261         ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
3262 #endif
3263         goto next_chunk;
3264
3265 err:
3266         remove_range(ic, &range);
3267         goto free_ret;
3268
3269 unlock_ret:
3270         spin_unlock_irq(&ic->endio_wait.lock);
3271
3272         recalc_write_super(ic);
3273
3274 free_ret:
3275         kfree(recalc_buffer);
3276         kfree(recalc_tags);
3277 }
3278
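/*
 * Bitmap-mode write path: bios whose range is already covered by the
 * may_write bitmap are dispatched immediately; for the rest, the bits are
 * first set in the on-disk bitmap (written with FUA) and the bios are
 * released only after that write completes.
 */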
3279 static void bitmap_block_work(struct work_struct *w)
3280 {
3281         struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
3282         struct dm_integrity_c *ic = bbs->ic;
3283         struct bio *bio;
3284         struct bio_list bio_queue;
3285         struct bio_list waiting;
3286
3287         bio_list_init(&waiting);
3288
3289         spin_lock(&bbs->bio_queue_lock);
3290         bio_queue = bbs->bio_queue;
3291         bio_list_init(&bbs->bio_queue);
3292         spin_unlock(&bbs->bio_queue_lock);
3293
3294         while ((bio = bio_list_pop(&bio_queue))) {
3295                 struct dm_integrity_io *dio;
3296
3297                 dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
3298
3299                 if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
3300                                     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
3301                         remove_range(ic, &dio->range);
3302                         INIT_WORK(&dio->work, integrity_bio_wait);
3303                         queue_work(ic->offload_wq, &dio->work);
3304                 } else {
3305                         block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
3306                                         dio->range.n_sectors, BITMAP_OP_SET);
3307                         bio_list_add(&waiting, bio);
3308                 }
3309         }
3310
3311         if (bio_list_empty(&waiting))
3312                 return;
3313
3314         rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC,
3315                            bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
3316                            BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
3317
3318         while ((bio = bio_list_pop(&waiting))) {
3319                 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
3320
3321                 block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
3322                                 dio->range.n_sectors, BITMAP_OP_SET);
3323
3324                 remove_range(ic, &dio->range);
3325                 INIT_WORK(&dio->work, integrity_bio_wait);
3326                 queue_work(ic->offload_wq, &dio->work);
3327         }
3328
3329         queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
3330 }
3331
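/*
 * Periodic bitmap flush: with all I/O locked out, flush dirty buffers, clear
 * the in-memory and on-disk bitmaps for the region known to be clean (bounded
 * by the recalculation position while recalculating) and complete bios that
 * were waiting for synchronous mode.
 */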
3332 static void bitmap_flush_work(struct work_struct *work)
3333 {
3334         struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
3335         struct dm_integrity_range range;
3336         unsigned long limit;
3337         struct bio *bio;
3338
3339         dm_integrity_flush_buffers(ic, false);
3340
3341         range.logical_sector = 0;
3342         range.n_sectors = ic->provided_data_sectors;
3343
3344         spin_lock_irq(&ic->endio_wait.lock);
3345         add_new_range_and_wait(ic, &range);
3346         spin_unlock_irq(&ic->endio_wait.lock);
3347
3348         dm_integrity_flush_buffers(ic, true);
3349
3350         limit = ic->provided_data_sectors;
3351         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
3352                 limit = le64_to_cpu(ic->sb->recalc_sector)
3353                         >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
3354                         << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
3355         }
3356         /*DEBUG_print("zeroing journal\n");*/
3357         block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
3358         block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
3359
3360         rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0,
3361                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3362
3363         spin_lock_irq(&ic->endio_wait.lock);
3364         remove_range_unlocked(ic, &range);
3365         while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
3366                 bio_endio(bio);
3367                 spin_unlock_irq(&ic->endio_wait.lock);
3368                 spin_lock_irq(&ic->endio_wait.lock);
3369         }
3370         spin_unlock_irq(&ic->endio_wait.lock);
3371 }
3372
3373
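/*
 * Initialize (erase) a range of journal sections: zero the sector data, stamp
 * each sector with a commit id derived from commit_seq, mark all entries
 * unused and write the sections out.
 */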
3374 static void init_journal(struct dm_integrity_c *ic, unsigned int start_section,
3375                          unsigned int n_sections, unsigned char commit_seq)
3376 {
3377         unsigned int i, j, n;
3378
3379         if (!n_sections)
3380                 return;
3381
3382         for (n = 0; n < n_sections; n++) {
3383                 i = start_section + n;
3384                 wraparound_section(ic, &i);
3385                 for (j = 0; j < ic->journal_section_sectors; j++) {
3386                         struct journal_sector *js = access_journal(ic, i, j);
3387
3388                         BUILD_BUG_ON(sizeof(js->sectors) != JOURNAL_SECTOR_DATA);
3389                         memset(&js->sectors, 0, sizeof(js->sectors));
3390                         js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
3391                 }
3392                 for (j = 0; j < ic->journal_section_entries; j++) {
3393                         struct journal_entry *je = access_journal_entry(ic, i, j);
3394
3395                         journal_entry_set_unused(je);
3396                 }
3397         }
3398
3399         write_journal(ic, start_section, n_sections);
3400 }
3401
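/*
 * Map an on-disk commit_id back to its commit sequence number; if it matches
 * none of the N_COMMIT_IDS possible values, the journal is corrupted.
 */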
3402 static int find_commit_seq(struct dm_integrity_c *ic, unsigned int i, unsigned int j, commit_id_t id)
3403 {
3404         unsigned char k;
3405
3406         for (k = 0; k < N_COMMIT_IDS; k++) {
3407                 if (dm_integrity_commit_id(ic, i, j, k) == id)
3408                         return k;
3409         }
3410         dm_integrity_io_error(ic, "journal commit id", -EIO);
3411         return -EIO;
3412 }
3413
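/*
 * Journal replay on activation: read (and, if configured, decrypt) the whole
 * journal, use the per-sector commit ids to determine which sections were
 * written consistently and replay them with do_journal_write(); if the
 * commit ids are corrupted, the journal is cleared instead.
 */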
3414 static void replay_journal(struct dm_integrity_c *ic)
3415 {
3416         unsigned int i, j;
3417         bool used_commit_ids[N_COMMIT_IDS];
3418         unsigned int max_commit_id_sections[N_COMMIT_IDS];
3419         unsigned int write_start, write_sections;
3420         unsigned int continue_section;
3421         bool journal_empty;
3422         unsigned char unused, last_used, want_commit_seq;
3423
3424         if (ic->mode == 'R')
3425                 return;
3426
3427         if (ic->journal_uptodate)
3428                 return;
3429
3430         last_used = 0;
3431         write_start = 0;
3432
3433         if (!ic->just_formatted) {
3434                 DEBUG_print("reading journal\n");
3435                 rw_journal(ic, REQ_OP_READ, 0, ic->journal_sections, NULL);
3436                 if (ic->journal_io)
3437                         DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
3438                 if (ic->journal_io) {
3439                         struct journal_completion crypt_comp;
3440
3441                         crypt_comp.ic = ic;
3442                         init_completion(&crypt_comp.comp);
3443                         crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
3444                         encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
3445                         wait_for_completion(&crypt_comp.comp);
3446                 }
3447                 DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
3448         }
3449
3450         if (dm_integrity_failed(ic))
3451                 goto clear_journal;
3452
3453         journal_empty = true;
3454         memset(used_commit_ids, 0, sizeof(used_commit_ids));
3455         memset(max_commit_id_sections, 0, sizeof(max_commit_id_sections));
3456         for (i = 0; i < ic->journal_sections; i++) {
3457                 for (j = 0; j < ic->journal_section_sectors; j++) {
3458                         int k;
3459                         struct journal_sector *js = access_journal(ic, i, j);
3460
3461                         k = find_commit_seq(ic, i, j, js->commit_id);
3462                         if (k < 0)
3463                                 goto clear_journal;
3464                         used_commit_ids[k] = true;
3465                         max_commit_id_sections[k] = i;
3466                 }
3467                 if (journal_empty) {
3468                         for (j = 0; j < ic->journal_section_entries; j++) {
3469                                 struct journal_entry *je = access_journal_entry(ic, i, j);
3470
3471                                 if (!journal_entry_is_unused(je)) {
3472                                         journal_empty = false;
3473                                         break;
3474                                 }
3475                         }
3476                 }
3477         }
3478
3479         if (!used_commit_ids[N_COMMIT_IDS - 1]) {
3480                 unused = N_COMMIT_IDS - 1;
3481                 while (unused && !used_commit_ids[unused - 1])
3482                         unused--;
3483         } else {
3484                 for (unused = 0; unused < N_COMMIT_IDS; unused++)
3485                         if (!used_commit_ids[unused])
3486                                 break;
3487                 if (unused == N_COMMIT_IDS) {
3488                         dm_integrity_io_error(ic, "journal commit ids", -EIO);
3489                         goto clear_journal;
3490                 }
3491         }
3492         DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
3493                     unused, used_commit_ids[0], used_commit_ids[1],
3494                     used_commit_ids[2], used_commit_ids[3]);
3495
3496         last_used = prev_commit_seq(unused);
3497         want_commit_seq = prev_commit_seq(last_used);
3498
3499         if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
3500                 journal_empty = true;
3501
3502         write_start = max_commit_id_sections[last_used] + 1;
3503         if (unlikely(write_start >= ic->journal_sections))
3504                 want_commit_seq = next_commit_seq(want_commit_seq);
3505         wraparound_section(ic, &write_start);
3506
3507         i = write_start;
3508         for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
3509                 for (j = 0; j < ic->journal_section_sectors; j++) {
3510                         struct journal_sector *js = access_journal(ic, i, j);
3511
3512                         if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
3513                                 /*
3514                                  * This could be caused by a crash during writing.
3515                                  * We won't replay the inconsistent part of the
3516                                  * journal.
3517                                  */
3518                                 DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
3519                                             i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
3520                                 goto brk;
3521                         }
3522                 }
3523                 i++;
3524                 if (unlikely(i >= ic->journal_sections))
3525                         want_commit_seq = next_commit_seq(want_commit_seq);
3526                 wraparound_section(ic, &i);
3527         }
3528 brk:
3529
3530         if (!journal_empty) {
3531                 DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
3532                             write_sections, write_start, want_commit_seq);
3533                 do_journal_write(ic, write_start, write_sections, true);
3534         }
3535
3536         if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
3537                 continue_section = write_start;
3538                 ic->commit_seq = want_commit_seq;
3539                 DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
3540         } else {
3541                 unsigned int s;
3542                 unsigned char erase_seq;
3543
3544 clear_journal:
3545                 DEBUG_print("clearing journal\n");
3546
3547                 erase_seq = prev_commit_seq(prev_commit_seq(last_used));
3548                 s = write_start;
3549                 init_journal(ic, s, 1, erase_seq);
3550                 s++;
3551                 wraparound_section(ic, &s);
3552                 if (ic->journal_sections >= 2) {
3553                         init_journal(ic, s, ic->journal_sections - 2, erase_seq);
3554                         s += ic->journal_sections - 2;
3555                         wraparound_section(ic, &s);
3556                         init_journal(ic, s, 1, erase_seq);
3557                 }
3558
3559                 continue_section = 0;
3560                 ic->commit_seq = next_commit_seq(erase_seq);
3561         }
3562
3563         ic->committed_section = continue_section;
3564         ic->n_committed_sections = 0;
3565
3566         ic->uncommitted_section = continue_section;
3567         ic->n_uncommitted_sections = 0;
3568
3569         ic->free_section = continue_section;
3570         ic->free_section_entry = 0;
3571         ic->free_sectors = ic->journal_entries;
3572
3573         ic->journal_tree_root = RB_ROOT;
3574         for (i = 0; i < ic->journal_entries; i++)
3575                 init_journal_node(&ic->journal_tree[i]);
3576 }
3577
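/*
 * Switch bitmap mode to synchronous operation (used from the reboot
 * notifier): shorten the bitmap flush interval and flush the bitmap
 * immediately so that the on-disk state is as clean as possible at shutdown.
 */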
3578 static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
3579 {
3580         DEBUG_print("%s\n", __func__);
3581
3582         if (ic->mode == 'B') {
3583                 ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
3584                 ic->synchronous_mode = 1;
3585
3586                 cancel_delayed_work_sync(&ic->bitmap_flush_work);
3587                 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
3588                 flush_workqueue(ic->commit_wq);
3589         }
3590 }
3591
3592 static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
3593 {
3594         struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
3595
3596         DEBUG_print("%s\n", __func__);
3597
3598         dm_integrity_enter_synchronous_mode(ic);
3599
3600         return NOTIFY_DONE;
3601 }
3602
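/*
 * Postsuspend: stop the autocommit timer, drain the recalculation, commit and
 * writer workqueues, write back the journal and, in bitmap mode, clear the
 * dirty-bitmap flag in the superblock so that the next activation does not
 * need recovery.
 */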
3603 static void dm_integrity_postsuspend(struct dm_target *ti)
3604 {
3605         struct dm_integrity_c *ic = ti->private;
3606         int r;
3607
3608         WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
3609
3610         del_timer_sync(&ic->autocommit_timer);
3611
3612         if (ic->recalc_wq)
3613                 drain_workqueue(ic->recalc_wq);
3614
3615         if (ic->mode == 'B')
3616                 cancel_delayed_work_sync(&ic->bitmap_flush_work);
3617
3618         queue_work(ic->commit_wq, &ic->commit_work);
3619         drain_workqueue(ic->commit_wq);
3620
3621         if (ic->mode == 'J') {
3622                 queue_work(ic->writer_wq, &ic->writer_work);
3623                 drain_workqueue(ic->writer_wq);
3624                 dm_integrity_flush_buffers(ic, true);
3625                 if (ic->wrote_to_journal) {
3626                         init_journal(ic, ic->free_section,
3627                                      ic->journal_sections - ic->free_section, ic->commit_seq);
3628                         if (ic->free_section) {
3629                                 init_journal(ic, 0, ic->free_section,
3630                                              next_commit_seq(ic->commit_seq));
3631                         }
3632                 }
3633         }
3634
3635         if (ic->mode == 'B') {
3636                 dm_integrity_flush_buffers(ic, true);
3637 #if 1
3638                 /* set to 0 to test bitmap replay code */
3639                 init_journal(ic, 0, ic->journal_sections, 0);
3640                 ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
3641                 r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA);
3642                 if (unlikely(r))
3643                         dm_integrity_io_error(ic, "writing superblock", r);
3644 #endif
3645         }
3646
3647         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
3648
3649         ic->journal_uptodate = true;
3650 }
3651
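/*
 * Resume: handle a grown device, recover from the dirty bitmap or replay the
 * journal as appropriate for the current mode, restart an interrupted
 * recalculation and register the reboot notifier that switches the target to
 * synchronous mode on shutdown.
 */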
3652 static void dm_integrity_resume(struct dm_target *ti)
3653 {
3654         struct dm_integrity_c *ic = ti->private;
3655         __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
3656         int r;
3657
3658         DEBUG_print("resume\n");
3659
3660         ic->wrote_to_journal = false;
3661
3662         if (ic->provided_data_sectors != old_provided_data_sectors) {
3663                 if (ic->provided_data_sectors > old_provided_data_sectors &&
3664                     ic->mode == 'B' &&
3665                     ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
3666                         rw_journal_sectors(ic, REQ_OP_READ, 0,
3667                                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3668                         block_bitmap_op(ic, ic->journal, old_provided_data_sectors,
3669                                         ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET);
3670                         rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0,
3671                                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3672                 }
3673
3674                 ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
3675                 r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA);
3676                 if (unlikely(r))
3677                         dm_integrity_io_error(ic, "writing superblock", r);
3678         }
3679
3680         if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
3681                 DEBUG_print("resume dirty_bitmap\n");
3682                 rw_journal_sectors(ic, REQ_OP_READ, 0,
3683                                    ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3684                 if (ic->mode == 'B') {
3685                         if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
3686                             !ic->reset_recalculate_flag) {
3687                                 block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
3688                                 block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
3689                                 if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
3690                                                      BITMAP_OP_TEST_ALL_CLEAR)) {
3691                                         ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
3692                                         ic->sb->recalc_sector = cpu_to_le64(0);
3693                                 }
3694                         } else {
3695                                 DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
3696                                             ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
3697                                 ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
3698                                 block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
3699                                 block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
3700                                 block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
3701                                 rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0,
3702                                                    ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3703                                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
3704                                 ic->sb->recalc_sector = cpu_to_le64(0);
3705                         }
3706                 } else {
3707                         if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
3708                               block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) ||
3709                             ic->reset_recalculate_flag) {
3710                                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
3711                                 ic->sb->recalc_sector = cpu_to_le64(0);
3712                         }
3713                         init_journal(ic, 0, ic->journal_sections, 0);
3714                         replay_journal(ic);
3715                         ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
3716                 }
3717                 r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA);
3718                 if (unlikely(r))
3719                         dm_integrity_io_error(ic, "writing superblock", r);
3720         } else {
3721                 replay_journal(ic);
3722                 if (ic->reset_recalculate_flag) {
3723                         ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
3724                         ic->sb->recalc_sector = cpu_to_le64(0);
3725                 }
3726                 if (ic->mode == 'B') {
3727                         ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
3728                         ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
3729                         r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA);
3730                         if (unlikely(r))
3731                                 dm_integrity_io_error(ic, "writing superblock", r);
3732
3733                         block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
3734                         block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
3735                         block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
3736                         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
3737                             le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) {
3738                                 block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector),
3739                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
3740                                 block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector),
3741                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
3742                                 block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector),
3743                                                 ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
3744                         }
3745                         rw_journal_sectors(ic, REQ_OP_WRITE | REQ_FUA | REQ_SYNC, 0,
3746                                            ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
3747                 }
3748         }
3749
3750         DEBUG_print("testing recalc: %x\n", ic->sb->flags);
3751         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
3752                 __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
3753
3754                 DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors);
3755                 if (recalc_pos < ic->provided_data_sectors) {
3756                         queue_work(ic->recalc_wq, &ic->recalc_work);
3757                 } else if (recalc_pos > ic->provided_data_sectors) {
3758                         ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors);
3759                         recalc_write_super(ic);
3760                 }
3761         }
3762
3763         ic->reboot_notifier.notifier_call = dm_integrity_reboot;
3764         ic->reboot_notifier.next = NULL;
3765         ic->reboot_notifier.priority = INT_MAX - 1;     /* be notified after md and before hardware drivers */
3766         WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
3767
3768 #if 0
3769         /* set to 1 to stress test synchronous mode */
3770         dm_integrity_enter_synchronous_mode(ic);
3771 #endif
3772 }
3773
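/*
 * Status reporting: STATUSTYPE_INFO shows the mismatch count, the provided
 * data sectors and the recalculation position; STATUSTYPE_TABLE reconstructs
 * the table line with all optional arguments.
 */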
3774 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
3775                                 unsigned int status_flags, char *result, unsigned int maxlen)
3776 {
3777         struct dm_integrity_c *ic = ti->private;
3778         unsigned int arg_count;
3779         size_t sz = 0;
3780
3781         switch (type) {
3782         case STATUSTYPE_INFO:
3783                 DMEMIT("%llu %llu",
3784                         (unsigned long long)atomic64_read(&ic->number_of_mismatches),
3785                         ic->provided_data_sectors);
3786                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
3787                         DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector));
3788                 else
3789                         DMEMIT(" -");
3790                 break;
3791
3792         case STATUSTYPE_TABLE: {
3793                 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
3794
3795                 watermark_percentage += ic->journal_entries / 2;
3796                 do_div(watermark_percentage, ic->journal_entries);
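                     /*
                      * E.g. with 10000 journal entries and a free_sectors_threshold
                      * of 5000 this reports 50; adding journal_entries / 2 before
                      * the division rounds to the nearest percent.
                      */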
3797                 arg_count = 3;
3798                 arg_count += !!ic->meta_dev;
3799                 arg_count += ic->sectors_per_block != 1;
3800                 arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
3801                 arg_count += ic->reset_recalculate_flag;
3802                 arg_count += ic->discard;
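                     /*
                      * 'J' and 'B' each contribute two optional arguments below
                      * (journal_watermark + commit_time, sectors_per_bit +
                      * bitmap_flush_interval), hence the doubled increments.
                      */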
3803                 arg_count += ic->mode == 'J';
3804                 arg_count += ic->mode == 'J';
3805                 arg_count += ic->mode == 'B';
3806                 arg_count += ic->mode == 'B';
3807                 arg_count += !!ic->internal_hash_alg.alg_string;
3808                 arg_count += !!ic->journal_crypt_alg.alg_string;
3809                 arg_count += !!ic->journal_mac_alg.alg_string;
3810                 arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
3811                 arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0;
3812                 arg_count += ic->legacy_recalculate;
3813                 DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
3814                        ic->tag_size, ic->mode, arg_count);
3815                 if (ic->meta_dev)
3816                         DMEMIT(" meta_device:%s", ic->meta_dev->name);
3817                 if (ic->sectors_per_block != 1)
3818                         DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
3819                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
3820                         DMEMIT(" recalculate");
3821                 if (ic->reset_recalculate_flag)
3822                         DMEMIT(" reset_recalculate");
3823                 if (ic->discard)
3824                         DMEMIT(" allow_discards");
3825                 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
3826                 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
3827                 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
3828                 if (ic->mode == 'J') {
3829                         DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage);
3830                         DMEMIT(" commit_time:%u", ic->autocommit_msec);
3831                 }
3832                 if (ic->mode == 'B') {
3833                         DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
3834                         DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
3835                 }
3836                 if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
3837                         DMEMIT(" fix_padding");
3838                 if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0)
3839                         DMEMIT(" fix_hmac");
3840                 if (ic->legacy_recalculate)
3841                         DMEMIT(" legacy_recalculate");
3842
3843 #define EMIT_ALG(a, n)                                                  \
3844                 do {                                                    \
3845                         if (ic->a.alg_string) {                         \
3846                                 DMEMIT(" %s:%s", n, ic->a.alg_string);  \
3847                                 if (ic->a.key_string)                   \
3848                                         DMEMIT(":%s", ic->a.key_string);\
3849                         }                                               \
3850                 } while (0)
3851                 EMIT_ALG(internal_hash_alg, "internal_hash");
3852                 EMIT_ALG(journal_crypt_alg, "journal_crypt");
3853                 EMIT_ALG(journal_mac_alg, "journal_mac");
3854                 break;
3855         }
3856         case STATUSTYPE_IMA:
3857                 DMEMIT_TARGET_NAME_VERSION(ti->type);
3858                 DMEMIT(",dev_name=%s,start=%llu,tag_size=%u,mode=%c",
3859                         ic->dev->name, ic->start, ic->tag_size, ic->mode);
3860
3861                 if (ic->meta_dev)
3862                         DMEMIT(",meta_device=%s", ic->meta_dev->name);
3863                 if (ic->sectors_per_block != 1)
3864                         DMEMIT(",block_size=%u", ic->sectors_per_block << SECTOR_SHIFT);
3865
3866                 DMEMIT(",recalculate=%c", (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ?
3867                        'y' : 'n');
3868                 DMEMIT(",allow_discards=%c", ic->discard ? 'y' : 'n');
3869                 DMEMIT(",fix_padding=%c",
3870                        ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) ? 'y' : 'n');
3871                 DMEMIT(",fix_hmac=%c",
3872                        ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) ? 'y' : 'n');
3873                 DMEMIT(",legacy_recalculate=%c", ic->legacy_recalculate ? 'y' : 'n');
3874
3875                 DMEMIT(",journal_sectors=%u", ic->initial_sectors - SB_SECTORS);
3876                 DMEMIT(",interleave_sectors=%u", 1U << ic->sb->log2_interleave_sectors);
3877                 DMEMIT(",buffer_sectors=%u", 1U << ic->log2_buffer_sectors);
3878                 DMEMIT(";");
3879                 break;
3880         }
3881 }
3882
3883 static int dm_integrity_iterate_devices(struct dm_target *ti,
3884                                         iterate_devices_callout_fn fn, void *data)
3885 {
3886         struct dm_integrity_c *ic = ti->private;
3887
3888         if (!ic->meta_dev)
3889                 return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
3890         else
3891                 return fn(ti, ic->dev, 0, ti->len, data);
3892 }
3893
3894 static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
3895 {
3896         struct dm_integrity_c *ic = ti->private;
3897
3898         if (ic->sectors_per_block > 1) {
3899                 limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
3900                 limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
3901                 limits->io_min = ic->sectors_per_block << SECTOR_SHIFT;
3902                 limits->dma_alignment = limits->logical_block_size - 1;
3903                 limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT;
3904         }
3905
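             /*
              * Without an internal hash the tags are supplied by the upper
              * layer, so advertise them as a block integrity profile:
              * one tag_size tuple per data block.
              */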
3906         if (!ic->internal_hash) {
3907                 struct blk_integrity *bi = &limits->integrity;
3908
3909                 memset(bi, 0, sizeof(*bi));
3910                 bi->tuple_size = ic->tag_size;
3911                 bi->tag_size = bi->tuple_size;
3912                 bi->interval_exp =
3913                         ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
3914         }
3915
3916         limits->max_integrity_segments = USHRT_MAX;
3917 }
3918
3919 static void calculate_journal_section_size(struct dm_integrity_c *ic)
3920 {
3921         unsigned int sector_space = JOURNAL_SECTOR_DATA;
3922
3923         ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
3924         ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
3925                                          JOURNAL_ENTRY_ROUNDUP);
3926
3927         if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
3928                 sector_space -= JOURNAL_MAC_PER_SECTOR;
3929         ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
3930         ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
3931         ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
3932         ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
3933 }
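     /*
      * Rough worked example for calculate_journal_section_size(), assuming
      * JOURNAL_SECTOR_DATA is 504 (a 512-byte sector minus the 8-byte commit
      * id) and JOURNAL_BLOCK_SECTORS is 8: with 512-byte blocks, 4-byte tags
      * and no journal MAC, an entry takes roundup(8 + 8 + 4, 8) = 24 bytes,
      * so 21 entries fit in a journal sector, 168 in a section, and a section
      * spans 168 data sectors plus 8 metadata sectors.
      */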
3934
3935 static int calculate_device_limits(struct dm_integrity_c *ic)
3936 {
3937         __u64 initial_sectors;
3938
3939         calculate_journal_section_size(ic);
3940         initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
3941         if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX)
3942                 return -EINVAL;
3943         ic->initial_sectors = initial_sectors;
3944
3945         if (ic->mode == 'I') {
3946                 if (ic->initial_sectors + ic->provided_data_sectors > ic->meta_device_sectors)
3947                         return -EINVAL;
3948         } else if (!ic->meta_dev) {
3949                 sector_t last_sector, last_area, last_offset;
3950
3951                 /* we have to maintain excessive padding for compatibility with existing volumes */
3952                 __u64 metadata_run_padding =
3953                         ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
3954                         (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
3955                         (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
3956
3957                 ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
3958                                             metadata_run_padding) >> SECTOR_SHIFT;
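                     /*
                      * E.g. with the default 32768-sector interleave, 512-byte
                      * blocks and 4-byte tags, an interleave area needs
                      * 4 << 15 = 131072 bytes of tags, i.e. metadata_run = 256
                      * sectors (already aligned, so the padding does not grow it).
                      */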
3959                 if (!(ic->metadata_run & (ic->metadata_run - 1)))
3960                         ic->log2_metadata_run = __ffs(ic->metadata_run);
3961                 else
3962                         ic->log2_metadata_run = -1;
3963
3964                 get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
3965                 last_sector = get_data_sector(ic, last_area, last_offset);
3966                 if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
3967                         return -EINVAL;
3968         } else {
3969                 __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
3970
3971                 meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
3972                                 >> (ic->log2_buffer_sectors + SECTOR_SHIFT);
3973                 meta_size <<= ic->log2_buffer_sectors;
3974                 if (ic->initial_sectors + meta_size < ic->initial_sectors ||
3975                     ic->initial_sectors + meta_size > ic->meta_device_sectors)
3976                         return -EINVAL;
3977                 ic->metadata_run = 1;
3978                 ic->log2_metadata_run = 0;
3979         }
3980
3981         return 0;
3982 }
3983
3984 static void get_provided_data_sectors(struct dm_integrity_c *ic)
3985 {
3986         if (!ic->meta_dev) {
3987                 int test_bit;
3988
3989                 ic->provided_data_sectors = 0;
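                     /*
                      * Build the largest usable data size bit by bit from the
                      * top: keep a candidate bit only if the resulting layout
                      * still fits the device (calculate_device_limits() succeeds).
                      */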
3990                 for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
3991                         __u64 prev_data_sectors = ic->provided_data_sectors;
3992
3993                         ic->provided_data_sectors |= (sector_t)1 << test_bit;
3994                         if (calculate_device_limits(ic))
3995                                 ic->provided_data_sectors = prev_data_sectors;
3996                 }
3997         } else {
3998                 ic->provided_data_sectors = ic->data_device_sectors;
3999                 ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
4000         }
4001 }
4002
4003 static int initialize_superblock(struct dm_integrity_c *ic,
4004                                  unsigned int journal_sectors, unsigned int interleave_sectors)
4005 {
4006         unsigned int journal_sections;
4007         int test_bit;
4008
4009         memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
4010         memcpy(ic->sb->magic, SB_MAGIC, 8);
4011         if (ic->mode == 'I')
4012                 ic->sb->flags |= cpu_to_le32(SB_FLAG_INLINE);
4013         ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
4014         ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
4015         if (ic->journal_mac_alg.alg_string)
4016                 ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
4017
4018         calculate_journal_section_size(ic);
4019         journal_sections = journal_sectors / ic->journal_section_sectors;
4020         if (!journal_sections)
4021                 journal_sections = 1;
4022         if (ic->mode == 'I')
4023                 journal_sections = 0;
4024
4025         if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) {
4026                 ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC);
4027                 get_random_bytes(ic->sb->salt, SALT_SIZE);
4028         }
4029
4030         if (!ic->meta_dev) {
4031                 if (ic->fix_padding)
4032                         ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
4033                 ic->sb->journal_sections = cpu_to_le32(journal_sections);
4034                 if (!interleave_sectors)
4035                         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
4036                 ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
4037                 ic->sb->log2_interleave_sectors = max_t(__u8, MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
4038                 ic->sb->log2_interleave_sectors = min_t(__u8, MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
4039
4040                 get_provided_data_sectors(ic);
4041                 if (!ic->provided_data_sectors)
4042                         return -EINVAL;
4043         } else {
4044                 ic->sb->log2_interleave_sectors = 0;
4045
4046                 get_provided_data_sectors(ic);
4047                 if (!ic->provided_data_sectors)
4048                         return -EINVAL;
4049
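                     /*
                      * Same greedy approach for the journal: accept the largest
                      * journal_sections value (built bit by bit) for which the
                      * metadata still fits; if nothing fits, retry with smaller
                      * metadata buffers.
                      */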
4050 try_smaller_buffer:
4051                 ic->sb->journal_sections = cpu_to_le32(0);
4052                 for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) {
4053                         __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections);
4054                         __u32 test_journal_sections = prev_journal_sections | (1U << test_bit);
4055
4056                         if (test_journal_sections > journal_sections)
4057                                 continue;
4058                         ic->sb->journal_sections = cpu_to_le32(test_journal_sections);
4059                         if (calculate_device_limits(ic))
4060                                 ic->sb->journal_sections = cpu_to_le32(prev_journal_sections);
4061
4062                 }
4063                 if (!le32_to_cpu(ic->sb->journal_sections)) {
4064                         if (ic->log2_buffer_sectors > 3) {
4065                                 ic->log2_buffer_sectors--;
4066                                 goto try_smaller_buffer;
4067                         }
4068                         return -EINVAL;
4069                 }
4070         }
4071
4072         ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
4073
4074         sb_set_version(ic);
4075
4076         return 0;
4077 }
4078
4079 static void dm_integrity_free_page_list(struct page_list *pl)
4080 {
4081         unsigned int i;
4082
4083         if (!pl)
4084                 return;
4085         for (i = 0; pl[i].page; i++)
4086                 __free_page(pl[i].page);
4087         kvfree(pl);
4088 }
4089
4090 static struct page_list *dm_integrity_alloc_page_list(unsigned int n_pages)
4091 {
4092         struct page_list *pl;
4093         unsigned int i;
4094
4095         pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
4096         if (!pl)
4097                 return NULL;
4098
4099         for (i = 0; i < n_pages; i++) {
4100                 pl[i].page = alloc_page(GFP_KERNEL);
4101                 if (!pl[i].page) {
4102                         dm_integrity_free_page_list(pl);
4103                         return NULL;
4104                 }
4105                 if (i)
4106                         pl[i - 1].next = &pl[i];
4107         }
4108         pl[i].page = NULL;
4109         pl[i].next = NULL;
4110
4111         return pl;
4112 }
4113
4114 static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
4115 {
4116         unsigned int i;
4117
4118         for (i = 0; i < ic->journal_sections; i++)
4119                 kvfree(sl[i]);
4120         kvfree(sl);
4121 }
4122
4123 static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
4124                                                                    struct page_list *pl)
4125 {
4126         struct scatterlist **sl;
4127         unsigned int i;
4128
4129         sl = kvmalloc_array(ic->journal_sections,
4130                             sizeof(struct scatterlist *),
4131                             GFP_KERNEL | __GFP_ZERO);
4132         if (!sl)
4133                 return NULL;
4134
4135         for (i = 0; i < ic->journal_sections; i++) {
4136                 struct scatterlist *s;
4137                 unsigned int start_index, start_offset;
4138                 unsigned int end_index, end_offset;
4139                 unsigned int n_pages;
4140                 unsigned int idx;
4141
4142                 page_list_location(ic, i, 0, &start_index, &start_offset);
4143                 page_list_location(ic, i, ic->journal_section_sectors - 1,
4144                                    &end_index, &end_offset);
4145
4146                 n_pages = (end_index - start_index + 1);
4147
4148                 s = kvmalloc_array(n_pages, sizeof(struct scatterlist),
4149                                    GFP_KERNEL);
4150                 if (!s) {
4151                         dm_integrity_free_journal_scatterlist(ic, sl);
4152                         return NULL;
4153                 }
4154
4155                 sg_init_table(s, n_pages);
4156                 for (idx = start_index; idx <= end_index; idx++) {
4157                         char *va = lowmem_page_address(pl[idx].page);
4158                         unsigned int start = 0, end = PAGE_SIZE;
4159
4160                         if (idx == start_index)
4161                                 start = start_offset;
4162                         if (idx == end_index)
4163                                 end = end_offset + (1 << SECTOR_SHIFT);
4164                         sg_set_buf(&s[idx - start_index], va + start, end - start);
4165                 }
4166
4167                 sl[i] = s;
4168         }
4169
4170         return sl;
4171 }
4172
4173 static void free_alg(struct alg_spec *a)
4174 {
4175         kfree_sensitive(a->alg_string);
4176         kfree_sensitive(a->key);
4177         memset(a, 0, sizeof(*a));
4178 }
4179
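     /*
      * Parse an "option:algorithm[:hexkey]" argument. For example (hypothetical
      * key), "journal_mac:hmac(sha256):0123456789abcdef" yields
      * alg_string = "hmac(sha256)" and an 8-byte key decoded from the hex digits.
      */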
4180 static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
4181 {
4182         char *k;
4183
4184         free_alg(a);
4185
4186         a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
4187         if (!a->alg_string)
4188                 goto nomem;
4189
4190         k = strchr(a->alg_string, ':');
4191         if (k) {
4192                 *k = 0;
4193                 a->key_string = k + 1;
4194                 if (strlen(a->key_string) & 1)
4195                         goto inval;
4196
4197                 a->key_size = strlen(a->key_string) / 2;
4198                 a->key = kmalloc(a->key_size, GFP_KERNEL);
4199                 if (!a->key)
4200                         goto nomem;
4201                 if (hex2bin(a->key, a->key_string, a->key_size))
4202                         goto inval;
4203         }
4204
4205         return 0;
4206 inval:
4207         *error = error_inval;
4208         return -EINVAL;
4209 nomem:
4210         *error = "Out of memory for an argument";
4211         return -ENOMEM;
4212 }
4213
4214 static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
4215                    char *error_alg, char *error_key)
4216 {
4217         int r;
4218
4219         if (a->alg_string) {
4220                 *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
4221                 if (IS_ERR(*hash)) {
4222                         *error = error_alg;
4223                         r = PTR_ERR(*hash);
4224                         *hash = NULL;
4225                         return r;
4226                 }
4227
4228                 if (a->key) {
4229                         r = crypto_shash_setkey(*hash, a->key, a->key_size);
4230                         if (r) {
4231                                 *error = error_key;
4232                                 return r;
4233                         }
4234                 } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
4235                         *error = error_key;
4236                         return -ENOKEY;
4237                 }
4238         }
4239
4240         return 0;
4241 }
4242
4243 static int create_journal(struct dm_integrity_c *ic, char **error)
4244 {
4245         int r = 0;
4246         unsigned int i;
4247         __u64 journal_pages, journal_desc_size, journal_tree_size;
4248         unsigned char *crypt_data = NULL, *crypt_iv = NULL;
4249         struct skcipher_request *req = NULL;
4250
4251         ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
4252         ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
4253         ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
4254         ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
4255
4256         journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
4257                                 PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
4258         journal_desc_size = journal_pages * sizeof(struct page_list);
4259         if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) {
4260                 *error = "Journal doesn't fit into memory";
4261                 r = -ENOMEM;
4262                 goto bad;
4263         }
4264         ic->journal_pages = journal_pages;
4265
4266         ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
4267         if (!ic->journal) {
4268                 *error = "Could not allocate memory for journal";
4269                 r = -ENOMEM;
4270                 goto bad;
4271         }
4272         if (ic->journal_crypt_alg.alg_string) {
4273                 unsigned int ivsize, blocksize;
4274                 struct journal_completion comp;
4275
4276                 comp.ic = ic;
4277                 ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
4278                 if (IS_ERR(ic->journal_crypt)) {
4279                         *error = "Invalid journal cipher";
4280                         r = PTR_ERR(ic->journal_crypt);
4281                         ic->journal_crypt = NULL;
4282                         goto bad;
4283                 }
4284                 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
4285                 blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
4286
4287                 if (ic->journal_crypt_alg.key) {
4288                         r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
4289                                                    ic->journal_crypt_alg.key_size);
4290                         if (r) {
4291                                 *error = "Error setting encryption key";
4292                                 goto bad;
4293                         }
4294                 }
4295                 DEBUG_print("cipher %s, block size %u iv size %u\n",
4296                             ic->journal_crypt_alg.alg_string, blocksize, ivsize);
4297
4298                 ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
4299                 if (!ic->journal_io) {
4300                         *error = "Could not allocate memory for journal io";
4301                         r = -ENOMEM;
4302                         goto bad;
4303                 }
4304
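                     /*
                      * Roughly: for a stream cipher (block size 1), the zeroed
                      * journal_xor pages and the commit ids are encrypted once,
                      * producing a keystream that is later XOR-ed over journal
                      * data; for a block cipher, one skcipher request per section
                      * is prepared, with an IV derived by encrypting the section
                      * number.
                      */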
4305                 if (blocksize == 1) {
4306                         struct scatterlist *sg;
4307
4308                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
4309                         if (!req) {
4310                                 *error = "Could not allocate crypt request";
4311                                 r = -ENOMEM;
4312                                 goto bad;
4313                         }
4314
4315                         crypt_iv = kzalloc(ivsize, GFP_KERNEL);
4316                         if (!crypt_iv) {
4317                                 *error = "Could not allocate iv";
4318                                 r = -ENOMEM;
4319                                 goto bad;
4320                         }
4321
4322                         ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
4323                         if (!ic->journal_xor) {
4324                                 *error = "Could not allocate memory for journal xor";
4325                                 r = -ENOMEM;
4326                                 goto bad;
4327                         }
4328
4329                         sg = kvmalloc_array(ic->journal_pages + 1,
4330                                             sizeof(struct scatterlist),
4331                                             GFP_KERNEL);
4332                         if (!sg) {
4333                                 *error = "Unable to allocate sg list";
4334                                 r = -ENOMEM;
4335                                 goto bad;
4336                         }
4337                         sg_init_table(sg, ic->journal_pages + 1);
4338                         for (i = 0; i < ic->journal_pages; i++) {
4339                                 char *va = lowmem_page_address(ic->journal_xor[i].page);
4340
4341                                 clear_page(va);
4342                                 sg_set_buf(&sg[i], va, PAGE_SIZE);
4343                         }
4344                         sg_set_buf(&sg[i], &ic->commit_ids, sizeof(ic->commit_ids));
4345
4346                         skcipher_request_set_crypt(req, sg, sg,
4347                                                    PAGE_SIZE * ic->journal_pages + sizeof(ic->commit_ids), crypt_iv);
4348                         init_completion(&comp.comp);
4349                         comp.in_flight = (atomic_t)ATOMIC_INIT(1);
4350                         if (do_crypt(true, req, &comp))
4351                                 wait_for_completion(&comp.comp);
4352                         kvfree(sg);
4353                         r = dm_integrity_failed(ic);
4354                         if (r) {
4355                                 *error = "Unable to encrypt journal";
4356                                 goto bad;
4357                         }
4358                         DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
4359
4360                         crypto_free_skcipher(ic->journal_crypt);
4361                         ic->journal_crypt = NULL;
4362                 } else {
4363                         unsigned int crypt_len = roundup(ivsize, blocksize);
4364
4365                         req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
4366                         if (!req) {
4367                                 *error = "Could not allocate crypt request";
4368                                 r = -ENOMEM;
4369                                 goto bad;
4370                         }
4371
4372                         crypt_iv = kmalloc(ivsize, GFP_KERNEL);
4373                         if (!crypt_iv) {
4374                                 *error = "Could not allocate iv";
4375                                 r = -ENOMEM;
4376                                 goto bad;
4377                         }
4378
4379                         crypt_data = kmalloc(crypt_len, GFP_KERNEL);
4380                         if (!crypt_data) {
4381                                 *error = "Unable to allocate crypt data";
4382                                 r = -ENOMEM;
4383                                 goto bad;
4384                         }
4385
4386                         ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
4387                         if (!ic->journal_scatterlist) {
4388                                 *error = "Unable to allocate sg list";
4389                                 r = -ENOMEM;
4390                                 goto bad;
4391                         }
4392                         ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
4393                         if (!ic->journal_io_scatterlist) {
4394                                 *error = "Unable to allocate sg list";
4395                                 r = -ENOMEM;
4396                                 goto bad;
4397                         }
4398                         ic->sk_requests = kvmalloc_array(ic->journal_sections,
4399                                                          sizeof(struct skcipher_request *),
4400                                                          GFP_KERNEL | __GFP_ZERO);
4401                         if (!ic->sk_requests) {
4402                                 *error = "Unable to allocate sk requests";
4403                                 r = -ENOMEM;
4404                                 goto bad;
4405                         }
4406                         for (i = 0; i < ic->journal_sections; i++) {
4407                                 struct scatterlist sg;
4408                                 struct skcipher_request *section_req;
4409                                 __le32 section_le = cpu_to_le32(i);
4410
4411                                 memset(crypt_iv, 0x00, ivsize);
4412                                 memset(crypt_data, 0x00, crypt_len);
4413                                 memcpy(crypt_data, &section_le, min_t(size_t, crypt_len, sizeof(section_le)));
4414
4415                                 sg_init_one(&sg, crypt_data, crypt_len);
4416                                 skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
4417                                 init_completion(&comp.comp);
4418                                 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
4419                                 if (do_crypt(true, req, &comp))
4420                                         wait_for_completion(&comp.comp);
4421
4422                                 r = dm_integrity_failed(ic);
4423                                 if (r) {
4424                                         *error = "Unable to generate iv";
4425                                         goto bad;
4426                                 }
4427
4428                                 section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
4429                                 if (!section_req) {
4430                                         *error = "Unable to allocate crypt request";
4431                                         r = -ENOMEM;
4432                                         goto bad;
4433                                 }
4434                                 section_req->iv = kmalloc_array(ivsize, 2,
4435                                                                 GFP_KERNEL);
4436                                 if (!section_req->iv) {
4437                                         skcipher_request_free(section_req);
4438                                         *error = "Unable to allocate iv";
4439                                         r = -ENOMEM;
4440                                         goto bad;
4441                                 }
4442                                 memcpy(section_req->iv + ivsize, crypt_data, ivsize);
4443                                 section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
4444                                 ic->sk_requests[i] = section_req;
4445                                 DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
4446                         }
4447                 }
4448         }
4449
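             /*
              * The journal encryption above may have transformed the commit
              * ids; make sure they are still pairwise distinct, bumping any
              * duplicate and re-checking.
              */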
4450         for (i = 0; i < N_COMMIT_IDS; i++) {
4451                 unsigned int j;
4452
4453 retest_commit_id:
4454                 for (j = 0; j < i; j++) {
4455                         if (ic->commit_ids[j] == ic->commit_ids[i]) {
4456                                 ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
4457                                 goto retest_commit_id;
4458                         }
4459                 }
4460                 DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
4461         }
4462
4463         journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
4464         if (journal_tree_size > ULONG_MAX) {
4465                 *error = "Journal doesn't fit into memory";
4466                 r = -ENOMEM;
4467                 goto bad;
4468         }
4469         ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
4470         if (!ic->journal_tree) {
4471                 *error = "Could not allocate memory for journal tree";
4472                 r = -ENOMEM;
4473         }
4474 bad:
4475         kfree(crypt_data);
4476         kfree(crypt_iv);
4477         skcipher_request_free(req);
4478
4479         return r;
4480 }
4481
4482 /*
4483  * Construct an integrity mapping
4484  *
4485  * Arguments:
4486  *      device
4487  *      offset from the start of the device
4488  *      tag size
4489  *      D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode, I - inline mode
4490  *      number of optional arguments
4491  *      optional arguments:
4492  *              journal_sectors
4493  *              interleave_sectors
4494  *              buffer_sectors
4495  *              journal_watermark
4496  *              commit_time
4497  *              meta_device
4498  *              block_size
4499  *              sectors_per_bit
4500  *              bitmap_flush_interval
4501  *              internal_hash
4502  *              journal_crypt
4503  *              journal_mac
4504  *              recalculate
      *              reset_recalculate
      *              allow_discards
      *              fix_padding
      *              fix_hmac
      *              legacy_recalculate
4505  */
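     /*
      * Illustrative example (hypothetical device and sizes) of a dmsetup
      * table line using this target:
      *
      *      0 1000000 integrity /dev/sdb 0 4 J 2 journal_sectors:1024 internal_hash:crc32c
      *
      * i.e. map 1000000 sectors, keep a 4-byte crc32c tag per 512-byte sector
      * and journal all writes.
      */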
4506 static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4507 {
4508         struct dm_integrity_c *ic;
4509         char dummy;
4510         int r;
4511         unsigned int extra_args;
4512         struct dm_arg_set as;
4513         static const struct dm_arg _args[] = {
4514                 {0, 18, "Invalid number of feature args"},
4515         };
4516         unsigned int journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
4517         bool should_write_sb;
4518         __u64 threshold;
4519         unsigned long long start;
4520         __s8 log2_sectors_per_bitmap_bit = -1;
4521         __s8 log2_blocks_per_bitmap_bit;
4522         __u64 bits_in_journal;
4523         __u64 n_bitmap_bits;
4524
4525 #define DIRECT_ARGUMENTS        4
4526
4527         if (argc <= DIRECT_ARGUMENTS) {
4528                 ti->error = "Invalid argument count";
4529                 return -EINVAL;
4530         }
4531
4532         ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
4533         if (!ic) {
4534                 ti->error = "Cannot allocate integrity context";
4535                 return -ENOMEM;
4536         }
4537         ti->private = ic;
4538         ti->per_io_data_size = sizeof(struct dm_integrity_io);
4539         ic->ti = ti;
4540
4541         ic->in_progress = RB_ROOT;
4542         INIT_LIST_HEAD(&ic->wait_list);
4543         init_waitqueue_head(&ic->endio_wait);
4544         bio_list_init(&ic->flush_bio_list);
4545         init_waitqueue_head(&ic->copy_to_journal_wait);
4546         init_completion(&ic->crypto_backoff);
4547         atomic64_set(&ic->number_of_mismatches, 0);
4548         ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
4549
4550         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
4551         if (r) {
4552                 ti->error = "Device lookup failed";
4553                 goto bad;
4554         }
4555
4556         if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
4557                 ti->error = "Invalid starting offset";
4558                 r = -EINVAL;
4559                 goto bad;
4560         }
4561         ic->start = start;
4562
4563         if (strcmp(argv[2], "-")) {
4564                 if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
4565                         ti->error = "Invalid tag size";
4566                         r = -EINVAL;
4567                         goto bad;
4568                 }
4569         }
4570
4571         if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
4572             !strcmp(argv[3], "D") || !strcmp(argv[3], "R") ||
4573             !strcmp(argv[3], "I")) {
4574                 ic->mode = argv[3][0];
4575         } else {
4576                 ti->error = "Invalid mode (expecting J, B, D, R, I)";
4577                 r = -EINVAL;
4578                 goto bad;
4579         }
4580
4581         journal_sectors = 0;
4582         interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
4583         buffer_sectors = DEFAULT_BUFFER_SECTORS;
4584         journal_watermark = DEFAULT_JOURNAL_WATERMARK;
4585         sync_msec = DEFAULT_SYNC_MSEC;
4586         ic->sectors_per_block = 1;
4587
4588         as.argc = argc - DIRECT_ARGUMENTS;
4589         as.argv = argv + DIRECT_ARGUMENTS;
4590         r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
4591         if (r)
4592                 goto bad;
4593
4594         while (extra_args--) {
4595                 const char *opt_string;
4596                 unsigned int val;
4597                 unsigned long long llval;
4598
4599                 opt_string = dm_shift_arg(&as);
4600                 if (!opt_string) {
4601                         r = -EINVAL;
4602                         ti->error = "Not enough feature arguments";
4603                         goto bad;
4604                 }
4605                 if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
4606                         journal_sectors = val ? val : 1;
4607                 else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
4608                         interleave_sectors = val;
4609                 else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
4610                         buffer_sectors = val;
4611                 else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
4612                         journal_watermark = val;
4613                 else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
4614                         sync_msec = val;
4615                 else if (!strncmp(opt_string, "meta_device:", strlen("meta_device:"))) {
4616                         if (ic->meta_dev) {
4617                                 dm_put_device(ti, ic->meta_dev);
4618                                 ic->meta_dev = NULL;
4619                         }
4620                         r = dm_get_device(ti, strchr(opt_string, ':') + 1,
4621                                           dm_table_get_mode(ti->table), &ic->meta_dev);
4622                         if (r) {
4623                                 ti->error = "Device lookup failed";
4624                                 goto bad;
4625                         }
4626                 } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
4627                         if (val < 1 << SECTOR_SHIFT ||
4628                             val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
4629                             (val & (val - 1))) {
4630                                 r = -EINVAL;
4631                                 ti->error = "Invalid block_size argument";
4632                                 goto bad;
4633                         }
4634                         ic->sectors_per_block = val >> SECTOR_SHIFT;
4635                 } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
4636                         log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
4637                 } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
4638                         if ((uint64_t)val >= (uint64_t)UINT_MAX * 1000 / HZ) {
4639                                 r = -EINVAL;
4640                                 ti->error = "Invalid bitmap_flush_interval argument";
4641                                 goto bad;
4642                         }
4643                         ic->bitmap_flush_interval = msecs_to_jiffies(val);
4644                 } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
4645                         r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
4646                                             "Invalid internal_hash argument");
4647                         if (r)
4648                                 goto bad;
4649                 } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
4650                         r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
4651                                             "Invalid journal_crypt argument");
4652                         if (r)
4653                                 goto bad;
4654                 } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
4655                         r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
4656                                             "Invalid journal_mac argument");
4657                         if (r)
4658                                 goto bad;
4659                 } else if (!strcmp(opt_string, "recalculate")) {
4660                         ic->recalculate_flag = true;
4661                 } else if (!strcmp(opt_string, "reset_recalculate")) {
4662                         ic->recalculate_flag = true;
4663                         ic->reset_recalculate_flag = true;
4664                 } else if (!strcmp(opt_string, "allow_discards")) {
4665                         ic->discard = true;
4666                 } else if (!strcmp(opt_string, "fix_padding")) {
4667                         ic->fix_padding = true;
4668                 } else if (!strcmp(opt_string, "fix_hmac")) {
4669                         ic->fix_hmac = true;
4670                 } else if (!strcmp(opt_string, "legacy_recalculate")) {
4671                         ic->legacy_recalculate = true;
4672                 } else {
4673                         r = -EINVAL;
4674                         ti->error = "Invalid argument";
4675                         goto bad;
4676                 }
4677         }
4678
4679         ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev);
4680         if (!ic->meta_dev)
4681                 ic->meta_device_sectors = ic->data_device_sectors;
4682         else
4683                 ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev);
4684
4685         if (!journal_sectors) {
4686                 journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
4687                                       ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
4688         }
4689
4690         if (!buffer_sectors)
4691                 buffer_sectors = 1;
4692         ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
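             /*
              * E.g. the default of 128 buffer sectors gives
              * log2_buffer_sectors = 7, i.e. 64 KiB metadata buffers.
              */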
4693
4694         r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
4695                     "Invalid internal hash", "Error setting internal hash key");
4696         if (r)
4697                 goto bad;
4698
4699         r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
4700                     "Invalid journal mac", "Error setting journal mac key");
4701         if (r)
4702                 goto bad;
4703
4704         if (!ic->tag_size) {
4705                 if (!ic->internal_hash) {
4706                         ti->error = "Unknown tag size";
4707                         r = -EINVAL;
4708                         goto bad;
4709                 }
4710                 ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
4711         }
4712         if (ic->tag_size > MAX_TAG_SIZE) {
4713                 ti->error = "Too big tag size";
4714                 r = -EINVAL;
4715                 goto bad;
4716         }
4717         if (!(ic->tag_size & (ic->tag_size - 1)))
4718                 ic->log2_tag_size = __ffs(ic->tag_size);
4719         else
4720                 ic->log2_tag_size = -1;
4721
4722         if (ic->mode == 'I') {
4723                 struct blk_integrity *bi;
4724                 if (ic->meta_dev) {
4725                         r = -EINVAL;
4726                         ti->error = "Metadata device not supported in inline mode";
4727                         goto bad;
4728                 }
4729                 if (!ic->internal_hash_alg.alg_string) {
4730                         r = -EINVAL;
4731                         ti->error = "Internal hash not set in inline mode";
4732                         goto bad;
4733                 }
4734                 if (ic->journal_crypt_alg.alg_string || ic->journal_mac_alg.alg_string) {
4735                         r = -EINVAL;
4736                         ti->error = "Journal crypt not supported in inline mode";
4737                         goto bad;
4738                 }
4739                 if (ic->discard) {
4740                         r = -EINVAL;
4741                         ti->error = "Discards not supported in inline mode";
4742                         goto bad;
4743                 }
4744                 bi = blk_get_integrity(ic->dev->bdev->bd_disk);
4745                 if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) {
4746                         r = -EINVAL;
4747                         ti->error = "Integrity profile not supported";
4748                         goto bad;
4749                 }
4750                 /*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/
4751                 if (bi->tuple_size < ic->tag_size) {
4752                         r = -EINVAL;
4753                         ti->error = "The integrity profile is smaller than tag size";
4754                         goto bad;
4755                 }
4756                 if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) {
4757                         r = -EINVAL;
4758                         ti->error = "Too big tuple size";
4759                         goto bad;
4760                 }
4761                 ic->tuple_size = bi->tuple_size;
4762                 if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) {
4763                         r = -EINVAL;
4764                         ti->error = "Integrity profile sector size mismatch";
4765                         goto bad;
4766                 }
4767         }
4768
4769         if (ic->mode == 'B' && !ic->internal_hash) {
4770                 r = -EINVAL;
4771                 ti->error = "Bitmap mode can only be used with internal hash";
4772                 goto bad;
4773         }
4774
4775         if (ic->discard && !ic->internal_hash) {
4776                 r = -EINVAL;
4777                 ti->error = "Discard can only be used with internal hash";
4778                 goto bad;
4779         }
4780
4781         ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
4782         ic->autocommit_msec = sync_msec;
4783         timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
4784
4785         ic->io = dm_io_client_create();
4786         if (IS_ERR(ic->io)) {
4787                 r = PTR_ERR(ic->io);
4788                 ic->io = NULL;
4789                 ti->error = "Cannot allocate dm io";
4790                 goto bad;
4791         }
4792
4793         r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
4794         if (r) {
4795                 ti->error = "Cannot allocate mempool";
4796                 goto bad;
4797         }
4798
4799         r = mempool_init_page_pool(&ic->recheck_pool, 1, ic->mode == 'I' ? 1 : 0);
4800         if (r) {
4801                 ti->error = "Cannot allocate mempool";
4802                 goto bad;
4803         }
4804
4805         if (ic->mode == 'I') {
4806                 r = bioset_init(&ic->recheck_bios, RECHECK_POOL_SIZE, 0, BIOSET_NEED_BVECS);
4807                 if (r) {
4808                         ti->error = "Cannot allocate bio set";
4809                         goto bad;
4810                 }
4811                 r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE);
4812                 if (r) {
4813                         ti->error = "Cannot allocate bio integrity set";
4814                         r = -ENOMEM;
4815                         goto bad;
4816                 }
4817                 r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
4818                 if (r) {
4819                         ti->error = "Cannot allocate bio set";
4820                         goto bad;
4821                 }
4822                 r = bioset_integrity_create(&ic->recalc_bios, 1);
4823                 if (r) {
4824                         ti->error = "Cannot allocate bio integrity set";
4825                         r = -ENOMEM;
4826                         goto bad;
4827                 }
4828         }
4829
4830         ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
4831                                           WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
4832         if (!ic->metadata_wq) {
4833                 ti->error = "Cannot allocate workqueue";
4834                 r = -ENOMEM;
4835                 goto bad;
4836         }
4837
4838         /*
4839          * If this workqueue weren't ordered, it would cause bio reordering
4840          * and reduced performance.
4841          */
4842         ic->wait_wq = alloc_ordered_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM);
4843         if (!ic->wait_wq) {
4844                 ti->error = "Cannot allocate workqueue";
4845                 r = -ENOMEM;
4846                 goto bad;
4847         }
4848
4849         ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM,
4850                                           METADATA_WORKQUEUE_MAX_ACTIVE);
4851         if (!ic->offload_wq) {
4852                 ti->error = "Cannot allocate workqueue";
4853                 r = -ENOMEM;
4854                 goto bad;
4855         }
4856
4857         ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
4858         if (!ic->commit_wq) {
4859                 ti->error = "Cannot allocate workqueue";
4860                 r = -ENOMEM;
4861                 goto bad;
4862         }
4863         INIT_WORK(&ic->commit_work, integrity_commit);
4864
4865         if (ic->mode == 'J' || ic->mode == 'B') {
4866                 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
4867                 if (!ic->writer_wq) {
4868                         ti->error = "Cannot allocate workqueue";
4869                         r = -ENOMEM;
4870                         goto bad;
4871                 }
4872                 INIT_WORK(&ic->writer_work, integrity_writer);
4873         }
4874
4875         ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
4876         if (!ic->sb) {
4877                 r = -ENOMEM;
4878                 ti->error = "Cannot allocate superblock area";
4879                 goto bad;
4880         }
4881
4882         r = sync_rw_sb(ic, REQ_OP_READ);
4883         if (r) {
4884                 ti->error = "Error reading superblock";
4885                 goto bad;
4886         }
4887         should_write_sb = false;
4888         if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
4889                 if (ic->mode != 'R') {
4890                         if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
4891                                 r = -EINVAL;
4892                                 ti->error = "The device is not initialized";
4893                                 goto bad;
4894                         }
4895                 }
4896
4897                 r = initialize_superblock(ic, journal_sectors, interleave_sectors);
4898                 if (r) {
4899                         ti->error = "Could not initialize superblock";
4900                         goto bad;
4901                 }
4902                 if (ic->mode != 'R')
4903                         should_write_sb = true;
4904         }
4905
4906         if (!ic->sb->version || ic->sb->version > SB_VERSION_6) {
4907                 r = -EINVAL;
4908                 ti->error = "Unknown version";
4909                 goto bad;
4910         }
4911         if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_INLINE)) != (ic->mode == 'I')) {
4912                 r = -EINVAL;
4913                 ti->error = "Inline flag mismatch";
4914                 goto bad;
4915         }
4916         if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
4917                 r = -EINVAL;
4918                 ti->error = "Tag size doesn't match the information in superblock";
4919                 goto bad;
4920         }
4921         if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
4922                 r = -EINVAL;
4923                 ti->error = "Block size doesn't match the information in superblock";
4924                 goto bad;
4925         }
4926         if (ic->mode != 'I') {
4927                 if (!le32_to_cpu(ic->sb->journal_sections)) {
4928                         r = -EINVAL;
4929                         ti->error = "Corrupted superblock, journal_sections is 0";
4930                         goto bad;
4931                 }
4932         } else {
4933                 if (le32_to_cpu(ic->sb->journal_sections)) {
4934                         r = -EINVAL;
4935                         ti->error = "Corrupted superblock, journal_sections is not 0";
4936                         goto bad;
4937                 }
4938         }
4939         /* make sure that ti->max_io_len doesn't overflow */
4940         if (!ic->meta_dev) {
4941                 if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
4942                     ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
4943                         r = -EINVAL;
4944                         ti->error = "Invalid interleave_sectors in the superblock";
4945                         goto bad;
4946                 }
4947         } else {
4948                 if (ic->sb->log2_interleave_sectors) {
4949                         r = -EINVAL;
4950                         ti->error = "Invalid interleave_sectors in the superblock";
4951                         goto bad;
4952                 }
4953         }
4954         if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
4955                 r = -EINVAL;
4956                 ti->error = "Journal mac mismatch";
4957                 goto bad;
4958         }
4959
4960         get_provided_data_sectors(ic);
4961         if (!ic->provided_data_sectors) {
4962                 r = -EINVAL;
4963                 ti->error = "The device is too small";
4964                 goto bad;
4965         }
4966
4967 try_smaller_buffer:
4968         r = calculate_device_limits(ic);
4969         if (r) {
4970                 if (ic->meta_dev) {
4971                         if (ic->log2_buffer_sectors > 3) {
4972                                 ic->log2_buffer_sectors--;
4973                                 goto try_smaller_buffer;
4974                         }
4975                 }
4976                 ti->error = "The device is too small";
4977                 goto bad;
4978         }
4979
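             /*
              * Pick the bitmap granularity: default to DEFAULT_SECTORS_PER_BITMAP_BIT
              * if none was requested, and never track ranges finer than one block.
              */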
4980         if (log2_sectors_per_bitmap_bit < 0)
4981                 log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
4982         if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
4983                 log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
4984
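             /*
              * The on-disk bitmap lives in the journal area, so the number of bits
              * that fit there caps its size; coarsen the granularity until the whole
              * provided data range can be covered.
              */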
4985         bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
4986         if (bits_in_journal > UINT_MAX)
4987                 bits_in_journal = UINT_MAX;
4988         if (bits_in_journal)
4989                 while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
4990                         log2_sectors_per_bitmap_bit++;
4991
4992         log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
4993         ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
4994         if (should_write_sb)
4995                 ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
4996
4997         n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
4998                                 + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
4999         ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
5000
5001         if (!ic->meta_dev)
5002                 ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
5003
5004         if (ti->len > ic->provided_data_sectors) {
5005                 r = -EINVAL;
5006                 ti->error = "Not enough provided sectors for requested mapping size";
5007                 goto bad;
5008         }
5009
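             /*
              * Convert the journal watermark percentage into an absolute count of
              * journal entries: (100 - journal_watermark)% of the total, rounded to
              * the nearest entry, is kept as the free-space threshold below which an
              * early journal commit is scheduled.
              */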
5010         threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
5011         threshold += 50;
5012         do_div(threshold, 100);
5013         ic->free_sectors_threshold = threshold;
5014
5015         DEBUG_print("initialized:\n");
5016         DEBUG_print("   integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
5017         DEBUG_print("   journal_entry_size %u\n", ic->journal_entry_size);
5018         DEBUG_print("   journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
5019         DEBUG_print("   journal_section_entries %u\n", ic->journal_section_entries);
5020         DEBUG_print("   journal_section_sectors %u\n", ic->journal_section_sectors);
5021         DEBUG_print("   journal_sections %u\n", (unsigned int)le32_to_cpu(ic->sb->journal_sections));
5022         DEBUG_print("   journal_entries %u\n", ic->journal_entries);
5023         DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
5024         DEBUG_print("   data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev));
5025         DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
5026         DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
5027         DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
5028         DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors);
5029         DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
5030         DEBUG_print("   bits_in_journal %llu\n", bits_in_journal);
5031
5032         if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
5033                 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
5034                 ic->sb->recalc_sector = cpu_to_le64(0);
5035         }
5036
5037         if (ic->internal_hash) {
5038                 ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
5039                 if (!ic->recalc_wq) {
5040                         ti->error = "Cannot allocate workqueue";
5041                         r = -ENOMEM;
5042                         goto bad;
5043                 }
5044                 INIT_WORK(&ic->recalc_work, ic->mode == 'I' ? integrity_recalc_inline : integrity_recalc);
5045         } else {
5046                 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
5047                         ti->error = "Recalculate can only be specified with internal_hash";
5048                         r = -EINVAL;
5049                         goto bad;
5050                 }
5051         }
5052
5053         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
5054             le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors &&
5055             dm_integrity_disable_recalculate(ic)) {
5056                 ti->error = "Recalculating with HMAC is disabled for security reasons - if you really need it, use the argument \"legacy_recalculate\"";
5057                 r = -EOPNOTSUPP;
5058                 goto bad;
5059         }
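             /*
              * The bufio client provides cached access to the on-disk metadata (tag)
              * area; each buffer is 2^log2_buffer_sectors sectors and the sector
              * offset skips ic->start + ic->initial_sectors.
              */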
5060
5061         ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
5062                         1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0);
5063         if (IS_ERR(ic->bufio)) {
5064                 r = PTR_ERR(ic->bufio);
5065                 ti->error = "Cannot initialize dm-bufio";
5066                 ic->bufio = NULL;
5067                 goto bad;
5068         }
5069         dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
5070
5071         if (ic->mode != 'R' && ic->mode != 'I') {
5072                 r = create_journal(ic, &ti->error);
5073                 if (r)
5074                         goto bad;
5075
5076         }
5077
5078         if (ic->mode == 'B') {
5079                 unsigned int i;
5080                 unsigned int n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
5081
5082                 ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
5083                 if (!ic->recalc_bitmap) {
5084                         r = -ENOMEM;
5085                         goto bad;
5086                 }
5087                 ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
5088                 if (!ic->may_write_bitmap) {
5089                         r = -ENOMEM;
5090                         goto bad;
5091                 }
5092                 ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
5093                 if (!ic->bbs) {
5094                         r = -ENOMEM;
5095                         goto bad;
5096                 }
5097                 INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
5098                 for (i = 0; i < ic->n_bitmap_blocks; i++) {
5099                         struct bitmap_block_status *bbs = &ic->bbs[i];
5100                         unsigned int sector, pl_index, pl_offset;
5101
5102                         INIT_WORK(&bbs->work, bitmap_block_work);
5103                         bbs->ic = ic;
5104                         bbs->idx = i;
5105                         bio_list_init(&bbs->bio_queue);
5106                         spin_lock_init(&bbs->bio_queue_lock);
5107
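                             /*
                              * Locate this bitmap block inside the journal page list:
                              * pl_index is the page and pl_offset the byte offset
                              * within that page.
                              */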
5108                         sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
5109                         pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
5110                         pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
5111
5112                         bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
5113                 }
5114         }
5115
5116         if (should_write_sb) {
5117                 init_journal(ic, 0, ic->journal_sections, 0);
5118                 r = dm_integrity_failed(ic);
5119                 if (unlikely(r)) {
5120                         ti->error = "Error initializing journal";
5121                         goto bad;
5122                 }
5123                 r = sync_rw_sb(ic, REQ_OP_WRITE | REQ_FUA);
5124                 if (r) {
5125                         ti->error = "Error initializing superblock";
5126                         goto bad;
5127                 }
5128                 ic->just_formatted = true;
5129         }
5130
5131         if (!ic->meta_dev && ic->mode != 'I') {
5132                 r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
5133                 if (r)
5134                         goto bad;
5135         }
5136         if (ic->mode == 'B') {
5137                 unsigned int max_io_len;
5138
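                     /*
                      * Cap the target's max_io_len at the number of data sectors
                      * covered by one bitmap block (BITMAP_BLOCK_SIZE * 8 bits); if
                      * the value truncates to zero in the 32-bit variable, fall back
                      * to 2^31 sectors.
                      */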
5139                 max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
5140                 if (!max_io_len)
5141                         max_io_len = 1U << 31;
5142                 DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
5143                 if (!ti->max_io_len || ti->max_io_len > max_io_len) {
5144                         r = dm_set_target_max_io_len(ti, max_io_len);
5145                         if (r)
5146                                 goto bad;
5147                 }
5148         }
5149
5150         ti->num_flush_bios = 1;
5151         ti->flush_supported = true;
5152         if (ic->discard)
5153                 ti->num_discard_bios = 1;
5154
5155         if (ic->mode == 'I')
5156                 ti->mempool_needs_integrity = true;
5157
5158         dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1);
5159         return 0;
5160
5161 bad:
5162         dm_audit_log_ctr(DM_MSG_PREFIX, ti, 0);
5163         dm_integrity_dtr(ti);
5164         return r;
5165 }
5166
5167 static void dm_integrity_dtr(struct dm_target *ti)
5168 {
5169         struct dm_integrity_c *ic = ti->private;
5170
5171         BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
5172         BUG_ON(!list_empty(&ic->wait_list));
5173
5174         if (ic->mode == 'B')
5175                 cancel_delayed_work_sync(&ic->bitmap_flush_work);
5176         if (ic->metadata_wq)
5177                 destroy_workqueue(ic->metadata_wq);
5178         if (ic->wait_wq)
5179                 destroy_workqueue(ic->wait_wq);
5180         if (ic->offload_wq)
5181                 destroy_workqueue(ic->offload_wq);
5182         if (ic->commit_wq)
5183                 destroy_workqueue(ic->commit_wq);
5184         if (ic->writer_wq)
5185                 destroy_workqueue(ic->writer_wq);
5186         if (ic->recalc_wq)
5187                 destroy_workqueue(ic->recalc_wq);
5188         kvfree(ic->bbs);
5189         if (ic->bufio)
5190                 dm_bufio_client_destroy(ic->bufio);
5191         bioset_exit(&ic->recalc_bios);
5192         bioset_exit(&ic->recheck_bios);
5193         mempool_exit(&ic->recheck_pool);
5194         mempool_exit(&ic->journal_io_mempool);
5195         if (ic->io)
5196                 dm_io_client_destroy(ic->io);
5197         if (ic->dev)
5198                 dm_put_device(ti, ic->dev);
5199         if (ic->meta_dev)
5200                 dm_put_device(ti, ic->meta_dev);
5201         dm_integrity_free_page_list(ic->journal);
5202         dm_integrity_free_page_list(ic->journal_io);
5203         dm_integrity_free_page_list(ic->journal_xor);
5204         dm_integrity_free_page_list(ic->recalc_bitmap);
5205         dm_integrity_free_page_list(ic->may_write_bitmap);
5206         if (ic->journal_scatterlist)
5207                 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
5208         if (ic->journal_io_scatterlist)
5209                 dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
5210         if (ic->sk_requests) {
5211                 unsigned int i;
5212
5213                 for (i = 0; i < ic->journal_sections; i++) {
5214                         struct skcipher_request *req;
5215
5216                         req = ic->sk_requests[i];
5217                         if (req) {
5218                                 kfree_sensitive(req->iv);
5219                                 skcipher_request_free(req);
5220                         }
5221                 }
5222                 kvfree(ic->sk_requests);
5223         }
5224         kvfree(ic->journal_tree);
5225         if (ic->sb)
5226                 free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
5227
5228         if (ic->internal_hash)
5229                 crypto_free_shash(ic->internal_hash);
5230         free_alg(&ic->internal_hash_alg);
5231
5232         if (ic->journal_crypt)
5233                 crypto_free_skcipher(ic->journal_crypt);
5234         free_alg(&ic->journal_crypt_alg);
5235
5236         if (ic->journal_mac)
5237                 crypto_free_shash(ic->journal_mac);
5238         free_alg(&ic->journal_mac_alg);
5239
5240         kfree(ic);
5241         dm_audit_log_dtr(DM_MSG_PREFIX, ti, 1);
5242 }
5243
5244 static struct target_type integrity_target = {
5245         .name                   = "integrity",
5246         .version                = {1, 13, 0},
5247         .module                 = THIS_MODULE,
5248         .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
5249         .ctr                    = dm_integrity_ctr,
5250         .dtr                    = dm_integrity_dtr,
5251         .map                    = dm_integrity_map,
5252         .end_io                 = dm_integrity_end_io,
5253         .postsuspend            = dm_integrity_postsuspend,
5254         .resume                 = dm_integrity_resume,
5255         .status                 = dm_integrity_status,
5256         .iterate_devices        = dm_integrity_iterate_devices,
5257         .io_hints               = dm_integrity_io_hints,
5258 };
5259
5260 static int __init dm_integrity_init(void)
5261 {
5262         int r;
5263
5264         journal_io_cache = kmem_cache_create("integrity_journal_io",
5265                                              sizeof(struct journal_io), 0, 0, NULL);
5266         if (!journal_io_cache) {
5267                 DMERR("can't allocate journal io cache");
5268                 return -ENOMEM;
5269         }
5270
5271         r = dm_register_target(&integrity_target);
5272         if (r < 0) {
5273                 kmem_cache_destroy(journal_io_cache);
5274                 return r;
5275         }
5276
5277         return 0;
5278 }
5279
5280 static void __exit dm_integrity_exit(void)
5281 {
5282         dm_unregister_target(&integrity_target);
5283         kmem_cache_destroy(journal_io_cache);
5284 }
5285
5286 module_init(dm_integrity_init);
5287 module_exit(dm_integrity_exit);
5288
5289 MODULE_AUTHOR("Milan Broz");
5290 MODULE_AUTHOR("Mikulas Patocka");
5291 MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
5292 MODULE_LICENSE("GPL");