drivers/md/dm-vdo/block-map.h

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright 2023 Red Hat
   4  */
   5
   6 #ifndef VDO_BLOCK_MAP_H
   7 #define VDO_BLOCK_MAP_H
   8
   9 #include <linux/list.h>
  10
  11 #include "numeric.h"
  12
  13 #include "admin-state.h"
  14 #include "completion.h"
  15 #include "encodings.h"
  16 #include "int-map.h"
  17 #include "statistics.h"
  18 #include "types.h"
  19 #include "vio.h"
  20 #include "wait-queue.h"
  21
/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */
  36
  37 enum {
  38         BLOCK_MAP_VIO_POOL_SIZE = 64,
  39 };
  40
  41 /*
  42  * Generation counter for page references.
  43  */
  44 typedef u32 vdo_page_generation;
  45
  46 extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;
  47
  48 /* The VDO Page Cache abstraction. */
  49 struct vdo_page_cache {
  50         /* the VDO which owns this cache */
  51         struct vdo *vdo;
  52         /* number of pages in cache */
  53         page_count_t page_count;
  54         /* number of pages to write in the current batch */
  55         page_count_t pages_in_batch;
  56         /* Whether the VDO is doing a read-only rebuild */
  57         bool rebuilding;
  58
  59         /* array of page information entries */
  60         struct page_info *infos;
  61         /* raw memory for pages */
  62         char *pages;
  63         /* cache last found page info */
  64         struct page_info *last_found;
  65         /* map of page number to info */
  66         struct int_map *page_map;
  67         /* main LRU list (all infos) */
  68         struct list_head lru_list;
  69         /* free page list (oldest first) */
  70         struct list_head free_list;
  71         /* outgoing page list */
  72         struct list_head outgoing_list;
  73         /* number of read I/O operations pending */
  74         page_count_t outstanding_reads;
  75         /* number of write I/O operations pending */
  76         page_count_t outstanding_writes;
  77         /* number of pages covered by the current flush */
  78         page_count_t pages_in_flush;
  79         /* number of pages waiting to be included in the next flush */
  80         page_count_t pages_to_flush;
  81         /* number of discards in progress */
  82         unsigned int discard_count;
  83         /* how many VPCs waiting for free page */
  84         unsigned int waiter_count;
  85         /* queue of waiters who want a free page */
  86         struct vdo_wait_queue free_waiters;
  87         /*
  88          * Statistics are only updated on the logical zone thread, but are accessed from other
  89          * threads.
  90          */
  91         struct block_map_statistics stats;
  92         /* counter for pressure reports */
  93         u32 pressure_report;
  94         /* the block map zone to which this cache belongs */
  95         struct block_map_zone *zone;
  96 };
  97
  98 /*
  99  * The state of a page buffer. If the page buffer is free no particular page is bound to it,
 100  * otherwise the page buffer is bound to particular page whose absolute pbn is in the pbn field. If
 101  * the page is resident or dirty the page data is stable and may be accessed. Otherwise the page is
 102  * in flight (incoming or outgoing) and its data should not be accessed.
 103  *
 104  * @note Update the static data in get_page_state_name() if you change this enumeration.
 105  */
 106 enum vdo_page_buffer_state {
 107         /* this page buffer is not being used */
 108         PS_FREE,
 109         /* this page is being read from store */
 110         PS_INCOMING,
 111         /* attempt to load this page failed */
 112         PS_FAILED,
 113         /* this page is valid and un-modified */
 114         PS_RESIDENT,
 115         /* this page is valid and modified */
 116         PS_DIRTY,
 117         /* this page is being written and should not be used */
 118         PS_OUTGOING,
 119         /* not a state */
 120         PAGE_STATE_COUNT,
 121 } __packed;
 122
 123 /*
 124  * The write status of page
 125  */
 126 enum vdo_page_write_status {
 127         WRITE_STATUS_NORMAL,
 128         WRITE_STATUS_DISCARD,
 129         WRITE_STATUS_DEFERRED,
 130 } __packed;
 131
 132 /* Per-page-slot information. */
 133 struct page_info {
 134         /* Preallocated page struct vio */
 135         struct vio *vio;
 136         /* back-link for references */
 137         struct vdo_page_cache *cache;
 138         /* the pbn of the page */
 139         physical_block_number_t pbn;
 140         /* page is busy (temporarily locked) */
 141         u16 busy;
 142         /* the write status the page */
 143         enum vdo_page_write_status write_status;
 144         /* page state */
 145         enum vdo_page_buffer_state state;
 146         /* queue of completions awaiting this item */
 147         struct vdo_wait_queue waiting;
 148         /* state linked list entry */
 149         struct list_head state_entry;
 150         /* LRU entry */
 151         struct list_head lru_entry;
 152         /*
 153          * The earliest recovery journal block containing uncommitted updates to the block map page
 154          * associated with this page_info. A reference (lock) is held on that block to prevent it
 155          * from being reaped. When this value changes, the reference on the old value must be
 156          * released and a reference on the new value must be acquired.
 157          */
 158         sequence_number_t recovery_lock;
 159 };
 160
 161 /*
 162  * A completion awaiting a specific page. Also a live reference into the page once completed, until
 163  * freed.
 164  */
 165 struct vdo_page_completion {
 166         /* The generic completion */
 167         struct vdo_completion completion;
 168         /* The cache involved */
 169         struct vdo_page_cache *cache;
 170         /* The waiter for the pending list */
 171         struct vdo_waiter waiter;
 172         /* The absolute physical block number of the page on disk */
 173         physical_block_number_t pbn;
 174         /* Whether the page may be modified */
 175         bool writable;
 176         /* Whether the page is available */
 177         bool ready;
 178         /* The info structure for the page, only valid when ready */
 179         struct page_info *info;
 180 };
 181
 182 struct forest;
 183
 184 struct tree_page {
 185         struct vdo_waiter waiter;
 186
 187         /* Dirty list entry */
 188         struct list_head entry;
 189
 190         /* If dirty, the tree zone flush generation in which it was last dirtied. */
 191         u8 generation;
 192
 193         /* Whether this page is an interior tree page being written out. */
 194         bool writing;
 195
 196         /* If writing, the tree zone flush generation of the copy being written. */
 197         u8 writing_generation;
 198
 199         /*
 200          * Sequence number of the earliest recovery journal block containing uncommitted updates to
 201          * this page
 202          */
 203         sequence_number_t recovery_lock;
 204
 205         /* The value of recovery_lock when the this page last started writing */
 206         sequence_number_t writing_recovery_lock;
 207
 208         char page_buffer[VDO_BLOCK_SIZE];
 209 };
 210
 211 enum block_map_page_type {
 212         VDO_TREE_PAGE,
 213         VDO_CACHE_PAGE,
 214 };
 215
 216 typedef struct list_head dirty_era_t[2];
 217
 218 struct dirty_lists {
 219         /* The number of periods after which an element will be expired */
 220         block_count_t maximum_age;
 221         /* The oldest period which has unexpired elements */
 222         sequence_number_t oldest_period;
 223         /* One more than the current period */
 224         sequence_number_t next_period;
 225         /* The offset in the array of lists of the oldest period */
 226         block_count_t offset;
 227         /* Expired pages */
 228         dirty_era_t expired;
 229         /* The lists of dirty pages */
 230         dirty_era_t eras[];
 231 };
 232
 233 struct block_map_zone {
 234         zone_count_t zone_number;
 235         thread_id_t thread_id;
 236         struct admin_state state;
 237         struct block_map *block_map;
 238         /* Dirty pages, by era*/
 239         struct dirty_lists *dirty_lists;
 240         struct vdo_page_cache page_cache;
 241         data_vio_count_t active_lookups;
 242         struct int_map *loading_pages;
 243         struct vio_pool *vio_pool;
 244         /* The tree page which has issued or will be issuing a flush */
 245         struct tree_page *flusher;
 246         struct vdo_wait_queue flush_waiters;
 247         /* The generation after the most recent flush */
 248         u8 generation;
 249         u8 oldest_generation;
 250         /* The counts of dirty pages in each generation */
 251         u32 dirty_page_counts[256];
 252 };
 253
 254 struct block_map {
 255         struct vdo *vdo;
 256         struct action_manager *action_manager;
 257         /* The absolute PBN of the first root of the tree part of the block map */
 258         physical_block_number_t root_origin;
 259         block_count_t root_count;
 260
 261         /* The era point we are currently distributing to the zones */
 262         sequence_number_t current_era_point;
 263         /* The next era point */
 264         sequence_number_t pending_era_point;
 265
 266         /* The number of entries in block map */
 267         block_count_t entry_count;
 268         nonce_t nonce;
 269         struct recovery_journal *journal;
 270
 271         /* The trees for finding block map pages */
 272         struct forest *forest;
 273         /* The expanded trees awaiting growth */
 274         struct forest *next_forest;
 275         /* The number of entries after growth */
 276         block_count_t next_entry_count;
 277
 278         zone_count_t zone_count;
 279         struct block_map_zone zones[];
 280 };
 281
 282 /**
 283  * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 284  *                                 the forest.
 285  * @pbn: A PBN of a tree node.
 286  * @completion: The parent completion of the traversal.
 287  *
 288  * Return: VDO_SUCCESS or an error.
 289  */
 290 typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
 291                                      struct vdo_completion *completion);
 292
 293 static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
 294 {
 295         vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
 296         return container_of(completion, struct vdo_page_completion, completion);
 297 }
 298
 299 void vdo_release_page_completion(struct vdo_completion *completion);
 300
 301 void vdo_get_page(struct vdo_page_completion *page_completion,
 302                   struct block_map_zone *zone, physical_block_number_t pbn,
 303                   bool writable, void *parent, vdo_action_fn callback,
 304                   vdo_action_fn error_handler, bool requeue);
 305
 306 void vdo_request_page_write(struct vdo_completion *completion);
 307
 308 int __must_check vdo_get_cached_page(struct vdo_completion *completion,
 309                                      struct block_map_page **page_ptr);
 310
 311 int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
 312
 313 static inline struct block_map_page * __must_check
 314 vdo_as_block_map_page(struct tree_page *tree_page)
 315 {
 316         return (struct block_map_page *) tree_page->page_buffer;
 317 }
 318
 319 bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
 320                          physical_block_number_t pbn,
 321                          struct block_map_page *page);
 322
 323 void vdo_find_block_map_slot(struct data_vio *data_vio);
 324
 325 physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
 326                                                     page_number_t page_number);
 327
 328 void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);
 329
 330 void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
 331                          struct vdo_completion *completion);
 332
 333 int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
 334                                       block_count_t logical_blocks, struct vdo *vdo,
 335                                       struct recovery_journal *journal, nonce_t nonce,
 336                                       page_count_t cache_size, block_count_t maximum_age,
 337                                       struct block_map **map_ptr);
 338
 339 void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
 340                          struct vdo_completion *parent);
 341
 342 void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);
 343
 344 int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
 345                                                block_count_t new_logical_blocks);
 346
 347 void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);
 348
 349 void vdo_abandon_block_map_growth(struct block_map *map);
 350
 351 void vdo_free_block_map(struct block_map *map);
 352
 353 struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);
 354
 355 void vdo_initialize_block_map_from_journal(struct block_map *map,
 356                                            struct recovery_journal *journal);
 357
 358 zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);
 359
 360 void vdo_advance_block_map_era(struct block_map *map,
 361                                sequence_number_t recovery_block_number);
 362
 363 void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
 364                                physical_block_number_t pbn,
 365                                enum block_mapping_state mapping_state,
 366                                sequence_number_t *recovery_lock);
 367
 368 void vdo_get_mapped_block(struct data_vio *data_vio);
 369
 370 void vdo_put_mapped_block(struct data_vio *data_vio);
 371
 372 struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
 373
 374 /**
 375  * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
 376  * @age: The configured maximum age
 377  *
 378  * Return: The converted age
 379  *
 380  * In the old recovery journal format, each journal block held 311 entries, and every write bio
 381  * made two entries. The old maximum age was half the usable journal length. In the new format,
 382  * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 383  * age so that the number of writes in a block map era is the same in the old and new formats. This
 384  * keeps the bound on the amount of work required to recover the block map from the recovery
 385  * journal the same across the format change. It also keeps the amortization of block map page
 386  * writes to write bios the same.
 387  */
 388 static inline block_count_t vdo_convert_maximum_age(block_count_t age)
 389 {
 390         return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
 391                             2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
 392 }
 393
 394 #endif /* VDO_BLOCK_MAP_H */