/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[0];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
		       GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;
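	/*
	 * Worked example (illustrative): with a 64-bit region_table_slot_t
	 * and nr_paths == 4, region_table_entry_bits computes to 2, so
	 * each slot packs region_entries_per_slot == 32 two-bit path
	 * numbers and region_entries_per_slot_bits is 5.
	 */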

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	sctx->nr_regions = nr_regions;
	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}

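/*
 * Convert a region number into the index of the slot that holds its
 * table entry and the bit position of that entry within the slot.
 */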
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}

/*
 * Find which path to use at given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned long region_index;
	unsigned bit, path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	switch_get_position(sctx, p, &region_index, &bit);
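	/*
	 * The table may be rewritten concurrently by set_region_mappings
	 * messages while IO is in flight; ACCESS_ONCE forces the compiler
	 * to read the slot exactly once.
	 */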
	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

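/*
 * Store a new path number in the table entry for region_nr using a
 * read-modify-write of the slot that contains it.
 */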
static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *    [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
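/*
 * Example (illustrative; device names and sizes are hypothetical): a
 * two-path table with 250-sector regions and no optional args could be
 * loaded with:
 *	dmsetup create s --table "0 20971520 switch 2 250 0 /dev/vdb 0 /dev/vdc 0"
 */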
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
	bio->bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance: loading 1,000,000
 * entries takes less time than with a condition-based parser.
 *			table-based parser	condition-based parser
 * PA-RISC		0.29s			0.31s
 * Opteron		0.0495s			0.0498s
 */
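/*
 * Bytes for '0'-'9', 'A'-'F' and 'a'-'f' map to their hex values;
 * every other byte maps to 255, which terminates parse_hex() below.
 */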
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}

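/*
 * Each argument after "set_region_mappings" is either
 * <region_index>:<path_nr> (both in hex) to remap an explicit region, or
 * :<path_nr> to remap the region following the previously given one.
 */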
static int process_set_region_mappings(struct switch_ctx *sctx,
				       unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
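/*
 * Example (illustrative): remap region 0x100 to path 1 and the two
 * regions after it to paths 2 and 1:
 *	dmsetup message s 0 set_region_mappings 100:1 :2 :1
 */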
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
			unsigned long arg)
{
	struct switch_ctx *sctx = ti->private;
	struct block_device *bdev;
	fmode_t mode;
	unsigned path_nr;
	int r = 0;

	path_nr = switch_get_path_nr(sctx, 0);

	bdev = sctx->path_list[path_nr].dmdev->bdev;
	mode = sctx->path_list[path_nr].dmdev->mode;

	/*
	 * Pass all ioctls through unrestricted only if the target and the
	 * underlying device sizes match exactly; otherwise permit just
	 * the ioctls that scsi_verify_blk_ioctl() allows.
	 */
	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
		       sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <[email protected]>");
MODULE_AUTHOR("Narendran Ganapathy <[email protected]>");
MODULE_AUTHOR("Jim Ramsay <[email protected]>");
MODULE_AUTHOR("Mikulas Patocka <[email protected]>");
MODULE_LICENSE("GPL");