]>
Commit | Line | Data |
---|---|---|
a1fec1db BH |
1 | /* |
2 | * Copyright (C) 2011 | |
aa281ac6 | 3 | * Boaz Harrosh <[email protected]> |
a1fec1db BH |
4 | * |
5 | * This file is part of the objects raid engine (ore). | |
6 | * | |
7 | * It is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as published | |
9 | * by the Free Software Foundation. | |
10 | * | |
11 | * You should have received a copy of the GNU General Public License | |
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | |
13 | * "Free Software Foundation <[email protected]>" | |
14 | */ | |
15 | ||
16 | #include <linux/gfp.h> | |
769ba8d9 | 17 | #include <linux/async_tx.h> |
a1fec1db BH |
18 | |
19 | #include "ore_raid.h" | |
20 | ||
769ba8d9 BH |
21 | #undef ORE_DBGMSG2 |
22 | #define ORE_DBGMSG2 ORE_DBGMSG | |
23 | ||
0961f02a | 24 | static struct page *_raid_page_alloc(void) |
a1fec1db BH |
25 | { |
26 | return alloc_page(GFP_KERNEL); | |
27 | } | |
28 | ||
/* Release a page obtained from _raid_page_alloc(). Callers in this file
 * check for NULL before calling.
 */
static void _raid_page_free(struct page *p)
{
	__free_page(p);
}
33 | ||
769ba8d9 BH |
34 | /* This struct is forward declare in ore_io_state, but is private to here. |
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | |
36 | * | |
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | |
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | |
39 | * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor | |
40 | * API. | |
41 | */ | |
42 | struct __stripe_pages_2d { | |
43 | /* Cache some hot path repeated calculations */ | |
44 | unsigned parity; | |
45 | unsigned data_devs; | |
46 | unsigned pages_in_unit; | |
47 | ||
48 | bool needed ; | |
49 | ||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | |
51 | struct __1_page_stripe { | |
52 | bool alloc; | |
53 | unsigned write_count; | |
54 | struct async_submit_ctl submit; | |
55 | struct dma_async_tx_descriptor *tx; | |
56 | ||
57 | /* The size of this array is data_devs + parity */ | |
58 | struct page **pages; | |
59 | struct page **scribble; | |
60 | /* bool array, size of this array is data_devs */ | |
61 | char *page_is_read; | |
62 | } _1p_stripes[]; | |
63 | }; | |
64 | ||
65 | /* This can get bigger then a page. So support multiple page allocations | |
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | |
67 | * none-zero). | |
68 | */ | |
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | |
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | |
71 | { | |
72 | struct __stripe_pages_2d *sp2d; | |
73 | unsigned data_devs = group_width - parity; | |
20fe9353 KC |
74 | |
75 | /* | |
76 | * Desired allocation layout is, though when larger than PAGE_SIZE, | |
77 | * each struct __alloc_1p_arrays is separately allocated: | |
78 | ||
769ba8d9 BH |
79 | struct _alloc_all_bytes { |
80 | struct __alloc_stripe_pages_2d { | |
81 | struct __stripe_pages_2d sp2d; | |
82 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | |
83 | } __asp2d; | |
84 | struct __alloc_1p_arrays { | |
85 | struct page *pages[group_width]; | |
86 | struct page *scribble[group_width]; | |
87 | char page_is_read[data_devs]; | |
88 | } __a1pa[pages_in_unit]; | |
89 | } *_aab; | |
20fe9353 | 90 | |
769ba8d9 BH |
91 | struct __alloc_1p_arrays *__a1pa; |
92 | struct __alloc_1p_arrays *__a1pa_end; | |
20fe9353 KC |
93 | |
94 | */ | |
95 | ||
96 | char *__a1pa; | |
97 | char *__a1pa_end; | |
98 | ||
99 | const size_t sizeof_stripe_pages_2d = | |
100 | sizeof(struct __stripe_pages_2d) + | |
101 | sizeof(struct __1_page_stripe) * pages_in_unit; | |
102 | const size_t sizeof__a1pa = | |
103 | ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, | |
104 | sizeof(void *)); | |
105 | const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; | |
106 | const size_t alloc_total = sizeof_stripe_pages_2d + | |
107 | sizeof__a1pa_arrays; | |
108 | ||
769ba8d9 BH |
109 | unsigned num_a1pa, alloc_size, i; |
110 | ||
111 | /* FIXME: check these numbers in ore_verify_layout */ | |
20fe9353 | 112 | BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); |
769ba8d9 BH |
113 | BUG_ON(sizeof__a1pa > PAGE_SIZE); |
114 | ||
20fe9353 KC |
115 | /* |
116 | * If alloc_total would be larger than PAGE_SIZE, only allocate | |
117 | * as many a1pa items as would fill the rest of the page, instead | |
118 | * of the full pages_in_unit count. | |
119 | */ | |
120 | if (alloc_total > PAGE_SIZE) { | |
121 | num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; | |
122 | alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; | |
769ba8d9 BH |
123 | } else { |
124 | num_a1pa = pages_in_unit; | |
20fe9353 | 125 | alloc_size = alloc_total; |
769ba8d9 BH |
126 | } |
127 | ||
20fe9353 KC |
128 | *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); |
129 | if (unlikely(!sp2d)) { | |
769ba8d9 BH |
130 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); |
131 | return -ENOMEM; | |
132 | } | |
20fe9353 | 133 | /* From here Just call _sp2d_free */ |
769ba8d9 | 134 | |
20fe9353 KC |
135 | /* Find start of a1pa area. */ |
136 | __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; | |
137 | /* Find end of the _allocated_ a1pa area. */ | |
138 | __a1pa_end = __a1pa + alloc_size; | |
769ba8d9 | 139 | |
20fe9353 | 140 | /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */ |
769ba8d9 | 141 | for (i = 0; i < pages_in_unit; ++i) { |
20fe9353 KC |
142 | struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i]; |
143 | ||
769ba8d9 BH |
144 | if (unlikely(__a1pa >= __a1pa_end)) { |
145 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | |
146 | pages_in_unit - i); | |
20fe9353 | 147 | alloc_size = sizeof__a1pa * num_a1pa; |
769ba8d9 | 148 | |
20fe9353 | 149 | __a1pa = kzalloc(alloc_size, GFP_KERNEL); |
769ba8d9 BH |
150 | if (unlikely(!__a1pa)) { |
151 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | |
152 | num_a1pa); | |
153 | return -ENOMEM; | |
154 | } | |
20fe9353 | 155 | __a1pa_end = __a1pa + alloc_size; |
769ba8d9 | 156 | /* First *pages is marked for kfree of the buffer */ |
20fe9353 | 157 | stripe->alloc = true; |
769ba8d9 BH |
158 | } |
159 | ||
20fe9353 KC |
160 | /* |
161 | * Attach all _lp_stripes pointers to the allocation for | |
162 | * it which was either part of the original PAGE_SIZE | |
163 | * allocation or the subsequent allocation in this loop. | |
164 | */ | |
165 | stripe->pages = (void *)__a1pa; | |
166 | stripe->scribble = stripe->pages + group_width; | |
167 | stripe->page_is_read = (char *)stripe->scribble + group_width; | |
168 | __a1pa += sizeof__a1pa; | |
769ba8d9 BH |
169 | } |
170 | ||
171 | sp2d->parity = parity; | |
172 | sp2d->data_devs = data_devs; | |
173 | sp2d->pages_in_unit = pages_in_unit; | |
174 | return 0; | |
175 | } | |
176 | ||
177 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | |
178 | const struct _ore_r4w_op *r4w, void *priv) | |
179 | { | |
180 | unsigned data_devs = sp2d->data_devs; | |
181 | unsigned group_width = data_devs + sp2d->parity; | |
537632e0 | 182 | int p, c; |
769ba8d9 BH |
183 | |
184 | if (!sp2d->needed) | |
185 | return; | |
186 | ||
537632e0 BH |
187 | for (c = data_devs - 1; c >= 0; --c) |
188 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
189 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
769ba8d9 | 190 | |
537632e0 BH |
191 | if (_1ps->page_is_read[c]) { |
192 | struct page *page = _1ps->pages[c]; | |
769ba8d9 | 193 | |
537632e0 BH |
194 | r4w->put_page(priv, page); |
195 | _1ps->page_is_read[c] = false; | |
196 | } | |
769ba8d9 BH |
197 | } |
198 | ||
537632e0 BH |
199 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
200 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
201 | ||
769ba8d9 BH |
202 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); |
203 | _1ps->write_count = 0; | |
204 | _1ps->tx = NULL; | |
205 | } | |
206 | ||
207 | sp2d->needed = false; | |
208 | } | |
209 | ||
210 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | |
211 | { | |
212 | unsigned i; | |
213 | ||
214 | if (!sp2d) | |
215 | return; | |
216 | ||
217 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | |
218 | if (sp2d->_1p_stripes[i].alloc) | |
219 | kfree(sp2d->_1p_stripes[i].pages); | |
220 | } | |
221 | ||
222 | kfree(sp2d); | |
223 | } | |
224 | ||
225 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | |
226 | { | |
227 | unsigned p; | |
228 | ||
229 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
230 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
231 | ||
232 | if (_1ps->write_count) | |
233 | return p; | |
234 | } | |
235 | ||
236 | return ~0; | |
237 | } | |
238 | ||
239 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |
240 | { | |
74b217d0 | 241 | int p; |
769ba8d9 BH |
242 | |
243 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
244 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
245 | ||
246 | if (_1ps->write_count) | |
247 | return p; | |
248 | } | |
249 | ||
250 | return ~0; | |
251 | } | |
252 | ||
253 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | |
254 | { | |
255 | unsigned p; | |
ce5d36aa BH |
256 | unsigned tx_flags = ASYNC_TX_ACK; |
257 | ||
258 | if (sp2d->parity == 1) | |
259 | tx_flags |= ASYNC_TX_XOR_ZERO_DST; | |
260 | ||
769ba8d9 BH |
261 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
262 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
263 | ||
264 | if (!_1ps->write_count) | |
265 | continue; | |
266 | ||
ce5d36aa | 267 | init_async_submit(&_1ps->submit, tx_flags, |
101a6427 | 268 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); |
769ba8d9 | 269 | |
ce5d36aa BH |
270 | if (sp2d->parity == 1) |
271 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], | |
272 | _1ps->pages, 0, sp2d->data_devs, | |
273 | PAGE_SIZE, &_1ps->submit); | |
274 | else /* parity == 2 */ | |
275 | _1ps->tx = async_gen_syndrome(_1ps->pages, 0, | |
276 | sp2d->data_devs + sp2d->parity, | |
277 | PAGE_SIZE, &_1ps->submit); | |
769ba8d9 BH |
278 | } |
279 | ||
280 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
281 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
282 | /* NOTE: We wait for HW synchronously (I don't have such HW | |
283 | * to test with.) Is parallelism needed with today's multi | |
284 | * cores? | |
285 | */ | |
286 | async_tx_issue_pending(_1ps->tx); | |
287 | } | |
288 | } | |
289 | ||
290 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | |
291 | struct ore_striping_info *si, struct page *page) | |
292 | { | |
293 | struct __1_page_stripe *_1ps; | |
294 | ||
295 | sp2d->needed = true; | |
296 | ||
297 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | |
298 | _1ps->pages[si->cur_comp] = page; | |
299 | ++_1ps->write_count; | |
300 | ||
301 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | |
302 | /* si->cur_comp is advanced outside at main loop */ | |
303 | } | |
304 | ||
a1fec1db BH |
305 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
306 | bool not_last) | |
307 | { | |
308 | struct osd_sg_entry *sge; | |
309 | ||
310 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | |
311 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | |
312 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | |
313 | _LLU(per_dev->offset), per_dev->length, | |
314 | per_dev->last_sgs_total); | |
315 | ||
316 | if (!per_dev->cur_sg) { | |
317 | sge = per_dev->sglist; | |
318 | ||
319 | /* First time we prepare two entries */ | |
320 | if (per_dev->length) { | |
321 | ++per_dev->cur_sg; | |
322 | sge->offset = per_dev->offset; | |
323 | sge->len = per_dev->length; | |
324 | } else { | |
325 | /* Here the parity is the first unit of this object. | |
326 | * This happens every time we reach a parity device on | |
327 | * the same stripe as the per_dev->offset. We need to | |
328 | * just skip this unit. | |
329 | */ | |
330 | per_dev->offset += cur_len; | |
331 | return; | |
332 | } | |
333 | } else { | |
334 | /* finalize the last one */ | |
335 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | |
336 | sge->len = per_dev->length - per_dev->last_sgs_total; | |
337 | } | |
338 | ||
339 | if (not_last) { | |
340 | /* Partly prepare the next one */ | |
341 | struct osd_sg_entry *next_sge = sge + 1; | |
342 | ||
343 | ++per_dev->cur_sg; | |
344 | next_sge->offset = sge->offset + sge->len + cur_len; | |
345 | /* Save cur len so we know how mutch was added next time */ | |
346 | per_dev->last_sgs_total = per_dev->length; | |
347 | next_sge->len = 0; | |
348 | } else if (!sge->len) { | |
349 | /* Optimize for when the last unit is a parity */ | |
350 | --per_dev->cur_sg; | |
351 | } | |
352 | } | |
353 | ||
769ba8d9 BH |
354 | static int _alloc_read_4_write(struct ore_io_state *ios) |
355 | { | |
356 | struct ore_layout *layout = ios->layout; | |
357 | int ret; | |
358 | /* We want to only read those pages not in cache so worst case | |
359 | * is a stripe populated with every other page | |
360 | */ | |
361 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | |
362 | ||
363 | ret = _ore_get_io_state(layout, ios->oc, | |
364 | layout->group_width * layout->mirrors_p1, | |
365 | sgs_per_dev, 0, &ios->ios_read_4_write); | |
366 | return ret; | |
367 | } | |
368 | ||
369 | /* @si contains info of the to-be-inserted page. Update of @si should be | |
370 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | |
371 | */ | |
724577ca BH |
372 | static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, |
373 | struct page *page, unsigned pg_len) | |
769ba8d9 BH |
374 | { |
375 | struct request_queue *q; | |
376 | struct ore_per_dev_state *per_dev; | |
377 | struct ore_io_state *read_ios; | |
378 | unsigned first_dev = si->dev - (si->dev % | |
379 | (ios->layout->group_width * ios->layout->mirrors_p1)); | |
380 | unsigned comp = si->dev - first_dev; | |
381 | unsigned added_len; | |
382 | ||
383 | if (!ios->ios_read_4_write) { | |
384 | int ret = _alloc_read_4_write(ios); | |
385 | ||
386 | if (unlikely(ret)) | |
387 | return ret; | |
388 | } | |
389 | ||
390 | read_ios = ios->ios_read_4_write; | |
391 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | |
392 | ||
393 | per_dev = &read_ios->per_dev[comp]; | |
394 | if (!per_dev->length) { | |
395 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | |
396 | ios->sp2d->pages_in_unit); | |
397 | if (unlikely(!per_dev->bio)) { | |
398 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | |
399 | ios->sp2d->pages_in_unit); | |
400 | return -ENOMEM; | |
401 | } | |
402 | per_dev->offset = si->obj_offset; | |
403 | per_dev->dev = si->dev; | |
404 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | |
405 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | |
406 | ||
407 | _ore_add_sg_seg(per_dev, gap, true); | |
408 | } | |
409 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | |
724577ca BH |
410 | added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, |
411 | si->obj_offset % PAGE_SIZE); | |
412 | if (unlikely(added_len != pg_len)) { | |
769ba8d9 BH |
413 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", |
414 | per_dev->bio->bi_vcnt); | |
415 | return -ENOMEM; | |
416 | } | |
417 | ||
724577ca | 418 | per_dev->length += pg_len; |
769ba8d9 BH |
419 | return 0; |
420 | } | |
421 | ||
724577ca BH |
422 | /* read the beginning of an unaligned first page */ |
423 | static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) | |
424 | { | |
425 | struct ore_striping_info si; | |
426 | unsigned pg_len; | |
427 | ||
428 | ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); | |
429 | ||
430 | pg_len = si.obj_offset % PAGE_SIZE; | |
431 | si.obj_offset -= pg_len; | |
432 | ||
433 | ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", | |
434 | _LLU(si.obj_offset), pg_len, page->index, si.dev); | |
435 | ||
436 | return _add_to_r4w(ios, &si, page, pg_len); | |
437 | } | |
438 | ||
439 | /* read the end of an incomplete last page */ | |
440 | static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) | |
441 | { | |
442 | struct ore_striping_info si; | |
443 | struct page *page; | |
444 | unsigned pg_len, p, c; | |
445 | ||
446 | ore_calc_stripe_info(ios->layout, *offset, 0, &si); | |
447 | ||
455682ce BH |
448 | p = si.cur_pg; |
449 | c = si.cur_comp; | |
724577ca BH |
450 | page = ios->sp2d->_1p_stripes[p].pages[c]; |
451 | ||
452 | pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); | |
453 | *offset += pg_len; | |
454 | ||
455 | ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", | |
456 | p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); | |
457 | ||
458 | BUG_ON(!page); | |
459 | ||
460 | return _add_to_r4w(ios, &si, page, pg_len); | |
461 | } | |
462 | ||
769ba8d9 BH |
463 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) |
464 | { | |
465 | struct bio_vec *bv; | |
466 | unsigned i, d; | |
467 | ||
468 | /* loop on all devices all pages */ | |
469 | for (d = 0; d < ios->numdevs; d++) { | |
470 | struct bio *bio = ios->per_dev[d].bio; | |
471 | ||
472 | if (!bio) | |
473 | continue; | |
474 | ||
d74c6d51 | 475 | bio_for_each_segment_all(bv, bio, i) { |
769ba8d9 BH |
476 | struct page *page = bv->bv_page; |
477 | ||
478 | SetPageUptodate(page); | |
479 | if (PageError(page)) | |
480 | ClearPageError(page); | |
481 | } | |
482 | } | |
483 | } | |
484 | ||
485 | /* read_4_write is hacked to read the start of the first stripe and/or | |
486 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | |
487 | * It is assumed to be called after the to_be_written pages of the first stripe | |
488 | * are populating ios->sp2d[][] | |
489 | * | |
490 | * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations | |
491 | * These pages are held at sp2d[p].pages[c] but with | |
492 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | |
493 | * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is | |
494 | * @uptodate=true, so we don't need to read it, only unlock, after IO. | |
495 | * | |
496 | * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then | |
497 | * to-be-written count, we should consider the xor-in-place mode. | |
498 | * need_to_read_pages_count is the actual number of pages not present in cache. | |
499 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | |
500 | * approximation? In this mode the read pages are put in the empty places of | |
501 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | |
502 | * allocated/freed and don't go through cache | |
503 | */ | |
9ff19309 | 504 | static int _read_4_write_first_stripe(struct ore_io_state *ios) |
769ba8d9 | 505 | { |
769ba8d9 BH |
506 | struct ore_striping_info read_si; |
507 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
508 | u64 offset = ios->si.first_stripe_start; | |
9ff19309 | 509 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; |
769ba8d9 BH |
510 | |
511 | if (offset == ios->offset) /* Go to start collect $200 */ | |
512 | goto read_last_stripe; | |
513 | ||
514 | min_p = _sp2d_min_pg(sp2d); | |
515 | max_p = _sp2d_max_pg(sp2d); | |
516 | ||
9ff19309 BH |
517 | ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", |
518 | offset, ios->offset, min_p, max_p); | |
519 | ||
769ba8d9 BH |
520 | for (c = 0; ; c++) { |
521 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
522 | read_si.obj_offset += min_p * PAGE_SIZE; | |
523 | offset += min_p * PAGE_SIZE; | |
524 | for (p = min_p; p <= max_p; p++) { | |
525 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
526 | struct page **pp = &_1ps->pages[c]; | |
527 | bool uptodate; | |
528 | ||
724577ca BH |
529 | if (*pp) { |
530 | if (ios->offset % PAGE_SIZE) | |
531 | /* Read the remainder of the page */ | |
532 | _add_to_r4w_first_page(ios, *pp); | |
769ba8d9 BH |
533 | /* to-be-written pages start here */ |
534 | goto read_last_stripe; | |
724577ca | 535 | } |
769ba8d9 BH |
536 | |
537 | *pp = ios->r4w->get_page(ios->private, offset, | |
538 | &uptodate); | |
539 | if (unlikely(!*pp)) | |
540 | return -ENOMEM; | |
541 | ||
542 | if (!uptodate) | |
724577ca | 543 | _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); |
769ba8d9 BH |
544 | |
545 | /* Mark read-pages to be cache_released */ | |
546 | _1ps->page_is_read[c] = true; | |
547 | read_si.obj_offset += PAGE_SIZE; | |
548 | offset += PAGE_SIZE; | |
549 | } | |
550 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | |
551 | } | |
552 | ||
553 | read_last_stripe: | |
9ff19309 BH |
554 | return 0; |
555 | } | |
556 | ||
557 | static int _read_4_write_last_stripe(struct ore_io_state *ios) | |
558 | { | |
559 | struct ore_striping_info read_si; | |
560 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
561 | u64 offset; | |
562 | u64 last_stripe_end; | |
563 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | |
564 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; | |
565 | ||
724577ca BH |
566 | offset = ios->offset + ios->length; |
567 | if (offset % PAGE_SIZE) | |
568 | _add_to_r4w_last_page(ios, &offset); | |
569 | /* offset will be aligned to next page */ | |
570 | ||
769ba8d9 BH |
571 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) |
572 | * bytes_in_stripe; | |
573 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | |
574 | goto read_it; | |
575 | ||
576 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
455682ce BH |
577 | p = read_si.cur_pg; |
578 | c = read_si.cur_comp; | |
769ba8d9 | 579 | |
769ba8d9 BH |
580 | if (min_p == sp2d->pages_in_unit) { |
581 | /* Didn't do it yet */ | |
582 | min_p = _sp2d_min_pg(sp2d); | |
583 | max_p = _sp2d_max_pg(sp2d); | |
584 | } | |
585 | ||
9ff19309 BH |
586 | ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", |
587 | offset, last_stripe_end, min_p, max_p); | |
588 | ||
769ba8d9 BH |
589 | while (offset < last_stripe_end) { |
590 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
591 | ||
592 | if ((min_p <= p) && (p <= max_p)) { | |
593 | struct page *page; | |
594 | bool uptodate; | |
595 | ||
596 | BUG_ON(_1ps->pages[c]); | |
597 | page = ios->r4w->get_page(ios->private, offset, | |
598 | &uptodate); | |
599 | if (unlikely(!page)) | |
600 | return -ENOMEM; | |
601 | ||
602 | _1ps->pages[c] = page; | |
603 | /* Mark read-pages to be cache_released */ | |
604 | _1ps->page_is_read[c] = true; | |
605 | if (!uptodate) | |
724577ca | 606 | _add_to_r4w(ios, &read_si, page, PAGE_SIZE); |
769ba8d9 BH |
607 | } |
608 | ||
609 | offset += PAGE_SIZE; | |
610 | if (p == (sp2d->pages_in_unit - 1)) { | |
611 | ++c; | |
612 | p = 0; | |
613 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
614 | } else { | |
615 | read_si.obj_offset += PAGE_SIZE; | |
616 | ++p; | |
617 | } | |
618 | } | |
619 | ||
620 | read_it: | |
9ff19309 BH |
621 | return 0; |
622 | } | |
623 | ||
624 | static int _read_4_write_execute(struct ore_io_state *ios) | |
625 | { | |
626 | struct ore_io_state *ios_read; | |
627 | unsigned i; | |
628 | int ret; | |
629 | ||
769ba8d9 BH |
630 | ios_read = ios->ios_read_4_write; |
631 | if (!ios_read) | |
632 | return 0; | |
633 | ||
634 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | |
635 | * to check for per_dev->bio | |
636 | */ | |
637 | ios_read->pages = ios->pages; | |
638 | ||
639 | /* Now read these devices */ | |
640 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | |
641 | ret = _ore_read_mirror(ios_read, i); | |
642 | if (unlikely(ret)) | |
643 | return ret; | |
644 | } | |
645 | ||
646 | ret = ore_io_execute(ios_read); /* Synchronus execution */ | |
647 | if (unlikely(ret)) { | |
648 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | |
649 | return ret; | |
650 | } | |
651 | ||
652 | _mark_read4write_pages_uptodate(ios_read, ret); | |
9ff19309 BH |
653 | ore_put_io_state(ios_read); |
654 | ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ | |
769ba8d9 BH |
655 | return 0; |
656 | } | |
657 | ||
a1fec1db BH |
658 | /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ |
659 | int _ore_add_parity_unit(struct ore_io_state *ios, | |
660 | struct ore_striping_info *si, | |
661 | struct ore_per_dev_state *per_dev, | |
ce5d36aa | 662 | unsigned cur_len, bool do_xor) |
a1fec1db BH |
663 | { |
664 | if (ios->reading) { | |
361aba56 BH |
665 | if (per_dev->cur_sg >= ios->sgs_per_dev) { |
666 | ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , | |
667 | per_dev->cur_sg, ios->sgs_per_dev); | |
668 | return -ENOMEM; | |
669 | } | |
a1fec1db BH |
670 | _ore_add_sg_seg(per_dev, cur_len, true); |
671 | } else { | |
769ba8d9 | 672 | struct __stripe_pages_2d *sp2d = ios->sp2d; |
a1fec1db | 673 | struct page **pages = ios->parity_pages + ios->cur_par_page; |
769ba8d9 | 674 | unsigned num_pages; |
a1fec1db BH |
675 | unsigned array_start = 0; |
676 | unsigned i; | |
677 | int ret; | |
678 | ||
769ba8d9 BH |
679 | si->cur_pg = _sp2d_min_pg(sp2d); |
680 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | |
681 | ||
769ba8d9 BH |
682 | if (!per_dev->length) { |
683 | per_dev->offset += si->cur_pg * PAGE_SIZE; | |
684 | /* If first stripe, Read in all read4write pages | |
685 | * (if needed) before we calculate the first parity. | |
686 | */ | |
ce5d36aa BH |
687 | if (do_xor) |
688 | _read_4_write_first_stripe(ios); | |
769ba8d9 | 689 | } |
ce5d36aa BH |
690 | if (!cur_len && do_xor) |
691 | /* If last stripe r4w pages of last stripe */ | |
9ff19309 BH |
692 | _read_4_write_last_stripe(ios); |
693 | _read_4_write_execute(ios); | |
769ba8d9 | 694 | |
a1fec1db BH |
695 | for (i = 0; i < num_pages; i++) { |
696 | pages[i] = _raid_page_alloc(); | |
697 | if (unlikely(!pages[i])) | |
698 | return -ENOMEM; | |
699 | ||
700 | ++(ios->cur_par_page); | |
a1fec1db BH |
701 | } |
702 | ||
ce5d36aa | 703 | BUG_ON(si->cur_comp < sp2d->data_devs); |
769ba8d9 | 704 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); |
a1fec1db BH |
705 | |
706 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | |
707 | per_dev, num_pages * PAGE_SIZE); | |
708 | if (unlikely(ret)) | |
709 | return ret; | |
769ba8d9 | 710 | |
ce5d36aa BH |
711 | if (do_xor) { |
712 | _gen_xor_unit(sp2d); | |
713 | _sp2d_reset(sp2d, ios->r4w, ios->private); | |
714 | } | |
a1fec1db BH |
715 | } |
716 | return 0; | |
717 | } | |
718 | ||
719 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |
720 | { | |
769ba8d9 | 721 | if (ios->parity_pages) { |
9ff19309 | 722 | struct ore_layout *layout = ios->layout; |
769ba8d9 | 723 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; |
769ba8d9 BH |
724 | |
725 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | |
726 | layout->parity, &ios->sp2d)) { | |
727 | return -ENOMEM; | |
728 | } | |
769ba8d9 | 729 | } |
a1fec1db BH |
730 | return 0; |
731 | } | |
732 | ||
733 | void _ore_free_raid_stuff(struct ore_io_state *ios) | |
734 | { | |
769ba8d9 | 735 | if (ios->sp2d) { /* writing and raid */ |
a1fec1db BH |
736 | unsigned i; |
737 | ||
738 | for (i = 0; i < ios->cur_par_page; i++) { | |
739 | struct page *page = ios->parity_pages[i]; | |
740 | ||
741 | if (page) | |
742 | _raid_page_free(page); | |
743 | } | |
744 | if (ios->extra_part_alloc) | |
745 | kfree(ios->parity_pages); | |
769ba8d9 BH |
746 | /* If IO returned an error pages might need unlocking */ |
747 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | |
748 | _sp2d_free(ios->sp2d); | |
a1fec1db BH |
749 | } else { |
750 | /* Will only be set if raid reading && sglist is big */ | |
751 | if (ios->extra_part_alloc) | |
752 | kfree(ios->per_dev[0].sglist); | |
753 | } | |
769ba8d9 BH |
754 | if (ios->ios_read_4_write) |
755 | ore_put_io_state(ios->ios_read_4_write); | |
a1fec1db | 756 | } |