]>
Commit | Line | Data |
---|---|---|
a1fec1db BH |
1 | /* |
2 | * Copyright (C) 2011 | |
aa281ac6 | 3 | * Boaz Harrosh <[email protected]> |
a1fec1db BH |
4 | * |
5 | * This file is part of the objects raid engine (ore). | |
6 | * | |
7 | * It is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as published | |
9 | * by the Free Software Foundation. | |
10 | * | |
11 | * You should have received a copy of the GNU General Public License | |
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | |
13 | * "Free Software Foundation <[email protected]>" | |
14 | */ | |
15 | ||
16 | #include <linux/gfp.h> | |
769ba8d9 | 17 | #include <linux/async_tx.h> |
a1fec1db BH |
18 | |
19 | #include "ore_raid.h" | |
20 | ||
769ba8d9 BH |
21 | #undef ORE_DBGMSG2 |
22 | #define ORE_DBGMSG2 ORE_DBGMSG | |
23 | ||
0961f02a | 24 | static struct page *_raid_page_alloc(void) |
a1fec1db BH |
25 | { |
26 | return alloc_page(GFP_KERNEL); | |
27 | } | |
28 | ||
/* Give back a single (order-0) page, the counterpart of _raid_page_alloc(). */
static void _raid_page_free(struct page *p)
{
	__free_pages(p, 0);
}
33 | ||
769ba8d9 BH |
34 | /* This struct is forward declare in ore_io_state, but is private to here. |
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | |
36 | * | |
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | |
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | |
39 | * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor | |
40 | * API. | |
41 | */ | |
42 | struct __stripe_pages_2d { | |
43 | /* Cache some hot path repeated calculations */ | |
44 | unsigned parity; | |
45 | unsigned data_devs; | |
46 | unsigned pages_in_unit; | |
47 | ||
48 | bool needed ; | |
49 | ||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | |
51 | struct __1_page_stripe { | |
52 | bool alloc; | |
53 | unsigned write_count; | |
54 | struct async_submit_ctl submit; | |
55 | struct dma_async_tx_descriptor *tx; | |
56 | ||
57 | /* The size of this array is data_devs + parity */ | |
58 | struct page **pages; | |
59 | struct page **scribble; | |
60 | /* bool array, size of this array is data_devs */ | |
61 | char *page_is_read; | |
62 | } _1p_stripes[]; | |
63 | }; | |
64 | ||
65 | /* This can get bigger then a page. So support multiple page allocations | |
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | |
67 | * none-zero). | |
68 | */ | |
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | |
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | |
71 | { | |
72 | struct __stripe_pages_2d *sp2d; | |
73 | unsigned data_devs = group_width - parity; | |
74 | struct _alloc_all_bytes { | |
75 | struct __alloc_stripe_pages_2d { | |
76 | struct __stripe_pages_2d sp2d; | |
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | |
78 | } __asp2d; | |
79 | struct __alloc_1p_arrays { | |
80 | struct page *pages[group_width]; | |
81 | struct page *scribble[group_width]; | |
82 | char page_is_read[data_devs]; | |
83 | } __a1pa[pages_in_unit]; | |
84 | } *_aab; | |
85 | struct __alloc_1p_arrays *__a1pa; | |
86 | struct __alloc_1p_arrays *__a1pa_end; | |
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | |
88 | unsigned num_a1pa, alloc_size, i; | |
89 | ||
90 | /* FIXME: check these numbers in ore_verify_layout */ | |
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | |
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | |
93 | ||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | |
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | |
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | |
97 | } else { | |
98 | num_a1pa = pages_in_unit; | |
99 | alloc_size = sizeof(*_aab); | |
100 | } | |
101 | ||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | |
103 | if (unlikely(!_aab)) { | |
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | |
105 | return -ENOMEM; | |
106 | } | |
107 | ||
108 | sp2d = &_aab->__asp2d.sp2d; | |
109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | |
110 | ||
111 | __a1pa = _aab->__a1pa; | |
112 | __a1pa_end = __a1pa + num_a1pa; | |
113 | ||
114 | for (i = 0; i < pages_in_unit; ++i) { | |
115 | if (unlikely(__a1pa >= __a1pa_end)) { | |
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | |
117 | pages_in_unit - i); | |
118 | ||
b134079f | 119 | __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); |
769ba8d9 BH |
120 | if (unlikely(!__a1pa)) { |
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | |
122 | num_a1pa); | |
123 | return -ENOMEM; | |
124 | } | |
125 | __a1pa_end = __a1pa + num_a1pa; | |
126 | /* First *pages is marked for kfree of the buffer */ | |
127 | sp2d->_1p_stripes[i].alloc = true; | |
128 | } | |
129 | ||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | |
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; | |
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | |
133 | ++__a1pa; | |
134 | } | |
135 | ||
136 | sp2d->parity = parity; | |
137 | sp2d->data_devs = data_devs; | |
138 | sp2d->pages_in_unit = pages_in_unit; | |
139 | return 0; | |
140 | } | |
141 | ||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | |
143 | const struct _ore_r4w_op *r4w, void *priv) | |
144 | { | |
145 | unsigned data_devs = sp2d->data_devs; | |
146 | unsigned group_width = data_devs + sp2d->parity; | |
537632e0 | 147 | int p, c; |
769ba8d9 BH |
148 | |
149 | if (!sp2d->needed) | |
150 | return; | |
151 | ||
537632e0 BH |
152 | for (c = data_devs - 1; c >= 0; --c) |
153 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
154 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
769ba8d9 | 155 | |
537632e0 BH |
156 | if (_1ps->page_is_read[c]) { |
157 | struct page *page = _1ps->pages[c]; | |
769ba8d9 | 158 | |
537632e0 BH |
159 | r4w->put_page(priv, page); |
160 | _1ps->page_is_read[c] = false; | |
161 | } | |
769ba8d9 BH |
162 | } |
163 | ||
537632e0 BH |
164 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
165 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
166 | ||
769ba8d9 BH |
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); |
168 | _1ps->write_count = 0; | |
169 | _1ps->tx = NULL; | |
170 | } | |
171 | ||
172 | sp2d->needed = false; | |
173 | } | |
174 | ||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | |
176 | { | |
177 | unsigned i; | |
178 | ||
179 | if (!sp2d) | |
180 | return; | |
181 | ||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | |
183 | if (sp2d->_1p_stripes[i].alloc) | |
184 | kfree(sp2d->_1p_stripes[i].pages); | |
185 | } | |
186 | ||
187 | kfree(sp2d); | |
188 | } | |
189 | ||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | |
191 | { | |
192 | unsigned p; | |
193 | ||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
196 | ||
197 | if (_1ps->write_count) | |
198 | return p; | |
199 | } | |
200 | ||
201 | return ~0; | |
202 | } | |
203 | ||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |
205 | { | |
74b217d0 | 206 | int p; |
769ba8d9 BH |
207 | |
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
210 | ||
211 | if (_1ps->write_count) | |
212 | return p; | |
213 | } | |
214 | ||
215 | return ~0; | |
216 | } | |
217 | ||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | |
219 | { | |
220 | unsigned p; | |
ce5d36aa BH |
221 | unsigned tx_flags = ASYNC_TX_ACK; |
222 | ||
223 | if (sp2d->parity == 1) | |
224 | tx_flags |= ASYNC_TX_XOR_ZERO_DST; | |
225 | ||
769ba8d9 BH |
226 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
227 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
228 | ||
229 | if (!_1ps->write_count) | |
230 | continue; | |
231 | ||
ce5d36aa | 232 | init_async_submit(&_1ps->submit, tx_flags, |
101a6427 | 233 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); |
769ba8d9 | 234 | |
ce5d36aa BH |
235 | if (sp2d->parity == 1) |
236 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], | |
237 | _1ps->pages, 0, sp2d->data_devs, | |
238 | PAGE_SIZE, &_1ps->submit); | |
239 | else /* parity == 2 */ | |
240 | _1ps->tx = async_gen_syndrome(_1ps->pages, 0, | |
241 | sp2d->data_devs + sp2d->parity, | |
242 | PAGE_SIZE, &_1ps->submit); | |
769ba8d9 BH |
243 | } |
244 | ||
245 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
246 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
247 | /* NOTE: We wait for HW synchronously (I don't have such HW | |
248 | * to test with.) Is parallelism needed with today's multi | |
249 | * cores? | |
250 | */ | |
251 | async_tx_issue_pending(_1ps->tx); | |
252 | } | |
253 | } | |
254 | ||
255 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | |
256 | struct ore_striping_info *si, struct page *page) | |
257 | { | |
258 | struct __1_page_stripe *_1ps; | |
259 | ||
260 | sp2d->needed = true; | |
261 | ||
262 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | |
263 | _1ps->pages[si->cur_comp] = page; | |
264 | ++_1ps->write_count; | |
265 | ||
266 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | |
267 | /* si->cur_comp is advanced outside at main loop */ | |
268 | } | |
269 | ||
a1fec1db BH |
270 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
271 | bool not_last) | |
272 | { | |
273 | struct osd_sg_entry *sge; | |
274 | ||
275 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | |
276 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | |
277 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | |
278 | _LLU(per_dev->offset), per_dev->length, | |
279 | per_dev->last_sgs_total); | |
280 | ||
281 | if (!per_dev->cur_sg) { | |
282 | sge = per_dev->sglist; | |
283 | ||
284 | /* First time we prepare two entries */ | |
285 | if (per_dev->length) { | |
286 | ++per_dev->cur_sg; | |
287 | sge->offset = per_dev->offset; | |
288 | sge->len = per_dev->length; | |
289 | } else { | |
290 | /* Here the parity is the first unit of this object. | |
291 | * This happens every time we reach a parity device on | |
292 | * the same stripe as the per_dev->offset. We need to | |
293 | * just skip this unit. | |
294 | */ | |
295 | per_dev->offset += cur_len; | |
296 | return; | |
297 | } | |
298 | } else { | |
299 | /* finalize the last one */ | |
300 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | |
301 | sge->len = per_dev->length - per_dev->last_sgs_total; | |
302 | } | |
303 | ||
304 | if (not_last) { | |
305 | /* Partly prepare the next one */ | |
306 | struct osd_sg_entry *next_sge = sge + 1; | |
307 | ||
308 | ++per_dev->cur_sg; | |
309 | next_sge->offset = sge->offset + sge->len + cur_len; | |
310 | /* Save cur len so we know how mutch was added next time */ | |
311 | per_dev->last_sgs_total = per_dev->length; | |
312 | next_sge->len = 0; | |
313 | } else if (!sge->len) { | |
314 | /* Optimize for when the last unit is a parity */ | |
315 | --per_dev->cur_sg; | |
316 | } | |
317 | } | |
318 | ||
769ba8d9 BH |
319 | static int _alloc_read_4_write(struct ore_io_state *ios) |
320 | { | |
321 | struct ore_layout *layout = ios->layout; | |
322 | int ret; | |
323 | /* We want to only read those pages not in cache so worst case | |
324 | * is a stripe populated with every other page | |
325 | */ | |
326 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | |
327 | ||
328 | ret = _ore_get_io_state(layout, ios->oc, | |
329 | layout->group_width * layout->mirrors_p1, | |
330 | sgs_per_dev, 0, &ios->ios_read_4_write); | |
331 | return ret; | |
332 | } | |
333 | ||
334 | /* @si contains info of the to-be-inserted page. Update of @si should be | |
335 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | |
336 | */ | |
724577ca BH |
337 | static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, |
338 | struct page *page, unsigned pg_len) | |
769ba8d9 BH |
339 | { |
340 | struct request_queue *q; | |
341 | struct ore_per_dev_state *per_dev; | |
342 | struct ore_io_state *read_ios; | |
343 | unsigned first_dev = si->dev - (si->dev % | |
344 | (ios->layout->group_width * ios->layout->mirrors_p1)); | |
345 | unsigned comp = si->dev - first_dev; | |
346 | unsigned added_len; | |
347 | ||
348 | if (!ios->ios_read_4_write) { | |
349 | int ret = _alloc_read_4_write(ios); | |
350 | ||
351 | if (unlikely(ret)) | |
352 | return ret; | |
353 | } | |
354 | ||
355 | read_ios = ios->ios_read_4_write; | |
356 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | |
357 | ||
358 | per_dev = &read_ios->per_dev[comp]; | |
359 | if (!per_dev->length) { | |
360 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | |
361 | ios->sp2d->pages_in_unit); | |
362 | if (unlikely(!per_dev->bio)) { | |
363 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | |
364 | ios->sp2d->pages_in_unit); | |
365 | return -ENOMEM; | |
366 | } | |
367 | per_dev->offset = si->obj_offset; | |
368 | per_dev->dev = si->dev; | |
369 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | |
370 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | |
371 | ||
372 | _ore_add_sg_seg(per_dev, gap, true); | |
373 | } | |
374 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | |
724577ca BH |
375 | added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, |
376 | si->obj_offset % PAGE_SIZE); | |
377 | if (unlikely(added_len != pg_len)) { | |
769ba8d9 BH |
378 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", |
379 | per_dev->bio->bi_vcnt); | |
380 | return -ENOMEM; | |
381 | } | |
382 | ||
724577ca | 383 | per_dev->length += pg_len; |
769ba8d9 BH |
384 | return 0; |
385 | } | |
386 | ||
724577ca BH |
387 | /* read the beginning of an unaligned first page */ |
388 | static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) | |
389 | { | |
390 | struct ore_striping_info si; | |
391 | unsigned pg_len; | |
392 | ||
393 | ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); | |
394 | ||
395 | pg_len = si.obj_offset % PAGE_SIZE; | |
396 | si.obj_offset -= pg_len; | |
397 | ||
398 | ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", | |
399 | _LLU(si.obj_offset), pg_len, page->index, si.dev); | |
400 | ||
401 | return _add_to_r4w(ios, &si, page, pg_len); | |
402 | } | |
403 | ||
404 | /* read the end of an incomplete last page */ | |
405 | static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) | |
406 | { | |
407 | struct ore_striping_info si; | |
408 | struct page *page; | |
409 | unsigned pg_len, p, c; | |
410 | ||
411 | ore_calc_stripe_info(ios->layout, *offset, 0, &si); | |
412 | ||
455682ce BH |
413 | p = si.cur_pg; |
414 | c = si.cur_comp; | |
724577ca BH |
415 | page = ios->sp2d->_1p_stripes[p].pages[c]; |
416 | ||
417 | pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); | |
418 | *offset += pg_len; | |
419 | ||
420 | ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", | |
421 | p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); | |
422 | ||
423 | BUG_ON(!page); | |
424 | ||
425 | return _add_to_r4w(ios, &si, page, pg_len); | |
426 | } | |
427 | ||
769ba8d9 BH |
428 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) |
429 | { | |
430 | struct bio_vec *bv; | |
431 | unsigned i, d; | |
432 | ||
433 | /* loop on all devices all pages */ | |
434 | for (d = 0; d < ios->numdevs; d++) { | |
435 | struct bio *bio = ios->per_dev[d].bio; | |
436 | ||
437 | if (!bio) | |
438 | continue; | |
439 | ||
d74c6d51 | 440 | bio_for_each_segment_all(bv, bio, i) { |
769ba8d9 BH |
441 | struct page *page = bv->bv_page; |
442 | ||
443 | SetPageUptodate(page); | |
444 | if (PageError(page)) | |
445 | ClearPageError(page); | |
446 | } | |
447 | } | |
448 | } | |
449 | ||
450 | /* read_4_write is hacked to read the start of the first stripe and/or | |
451 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | |
452 | * It is assumed to be called after the to_be_written pages of the first stripe | |
453 | * are populating ios->sp2d[][] | |
454 | * | |
455 | * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations | |
456 | * These pages are held at sp2d[p].pages[c] but with | |
457 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | |
458 | * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is | |
459 | * @uptodate=true, so we don't need to read it, only unlock, after IO. | |
460 | * | |
461 | * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then | |
462 | * to-be-written count, we should consider the xor-in-place mode. | |
463 | * need_to_read_pages_count is the actual number of pages not present in cache. | |
464 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | |
465 | * approximation? In this mode the read pages are put in the empty places of | |
466 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | |
467 | * allocated/freed and don't go through cache | |
468 | */ | |
9ff19309 | 469 | static int _read_4_write_first_stripe(struct ore_io_state *ios) |
769ba8d9 | 470 | { |
769ba8d9 BH |
471 | struct ore_striping_info read_si; |
472 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
473 | u64 offset = ios->si.first_stripe_start; | |
9ff19309 | 474 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; |
769ba8d9 BH |
475 | |
476 | if (offset == ios->offset) /* Go to start collect $200 */ | |
477 | goto read_last_stripe; | |
478 | ||
479 | min_p = _sp2d_min_pg(sp2d); | |
480 | max_p = _sp2d_max_pg(sp2d); | |
481 | ||
9ff19309 BH |
482 | ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", |
483 | offset, ios->offset, min_p, max_p); | |
484 | ||
769ba8d9 BH |
485 | for (c = 0; ; c++) { |
486 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
487 | read_si.obj_offset += min_p * PAGE_SIZE; | |
488 | offset += min_p * PAGE_SIZE; | |
489 | for (p = min_p; p <= max_p; p++) { | |
490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
491 | struct page **pp = &_1ps->pages[c]; | |
492 | bool uptodate; | |
493 | ||
724577ca BH |
494 | if (*pp) { |
495 | if (ios->offset % PAGE_SIZE) | |
496 | /* Read the remainder of the page */ | |
497 | _add_to_r4w_first_page(ios, *pp); | |
769ba8d9 BH |
498 | /* to-be-written pages start here */ |
499 | goto read_last_stripe; | |
724577ca | 500 | } |
769ba8d9 BH |
501 | |
502 | *pp = ios->r4w->get_page(ios->private, offset, | |
503 | &uptodate); | |
504 | if (unlikely(!*pp)) | |
505 | return -ENOMEM; | |
506 | ||
507 | if (!uptodate) | |
724577ca | 508 | _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); |
769ba8d9 BH |
509 | |
510 | /* Mark read-pages to be cache_released */ | |
511 | _1ps->page_is_read[c] = true; | |
512 | read_si.obj_offset += PAGE_SIZE; | |
513 | offset += PAGE_SIZE; | |
514 | } | |
515 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | |
516 | } | |
517 | ||
518 | read_last_stripe: | |
9ff19309 BH |
519 | return 0; |
520 | } | |
521 | ||
522 | static int _read_4_write_last_stripe(struct ore_io_state *ios) | |
523 | { | |
524 | struct ore_striping_info read_si; | |
525 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
526 | u64 offset; | |
527 | u64 last_stripe_end; | |
528 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | |
529 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; | |
530 | ||
724577ca BH |
531 | offset = ios->offset + ios->length; |
532 | if (offset % PAGE_SIZE) | |
533 | _add_to_r4w_last_page(ios, &offset); | |
534 | /* offset will be aligned to next page */ | |
535 | ||
769ba8d9 BH |
536 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) |
537 | * bytes_in_stripe; | |
538 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | |
539 | goto read_it; | |
540 | ||
541 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
455682ce BH |
542 | p = read_si.cur_pg; |
543 | c = read_si.cur_comp; | |
769ba8d9 | 544 | |
769ba8d9 BH |
545 | if (min_p == sp2d->pages_in_unit) { |
546 | /* Didn't do it yet */ | |
547 | min_p = _sp2d_min_pg(sp2d); | |
548 | max_p = _sp2d_max_pg(sp2d); | |
549 | } | |
550 | ||
9ff19309 BH |
551 | ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", |
552 | offset, last_stripe_end, min_p, max_p); | |
553 | ||
769ba8d9 BH |
554 | while (offset < last_stripe_end) { |
555 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
556 | ||
557 | if ((min_p <= p) && (p <= max_p)) { | |
558 | struct page *page; | |
559 | bool uptodate; | |
560 | ||
561 | BUG_ON(_1ps->pages[c]); | |
562 | page = ios->r4w->get_page(ios->private, offset, | |
563 | &uptodate); | |
564 | if (unlikely(!page)) | |
565 | return -ENOMEM; | |
566 | ||
567 | _1ps->pages[c] = page; | |
568 | /* Mark read-pages to be cache_released */ | |
569 | _1ps->page_is_read[c] = true; | |
570 | if (!uptodate) | |
724577ca | 571 | _add_to_r4w(ios, &read_si, page, PAGE_SIZE); |
769ba8d9 BH |
572 | } |
573 | ||
574 | offset += PAGE_SIZE; | |
575 | if (p == (sp2d->pages_in_unit - 1)) { | |
576 | ++c; | |
577 | p = 0; | |
578 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
579 | } else { | |
580 | read_si.obj_offset += PAGE_SIZE; | |
581 | ++p; | |
582 | } | |
583 | } | |
584 | ||
585 | read_it: | |
9ff19309 BH |
586 | return 0; |
587 | } | |
588 | ||
589 | static int _read_4_write_execute(struct ore_io_state *ios) | |
590 | { | |
591 | struct ore_io_state *ios_read; | |
592 | unsigned i; | |
593 | int ret; | |
594 | ||
769ba8d9 BH |
595 | ios_read = ios->ios_read_4_write; |
596 | if (!ios_read) | |
597 | return 0; | |
598 | ||
599 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | |
600 | * to check for per_dev->bio | |
601 | */ | |
602 | ios_read->pages = ios->pages; | |
603 | ||
604 | /* Now read these devices */ | |
605 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | |
606 | ret = _ore_read_mirror(ios_read, i); | |
607 | if (unlikely(ret)) | |
608 | return ret; | |
609 | } | |
610 | ||
611 | ret = ore_io_execute(ios_read); /* Synchronus execution */ | |
612 | if (unlikely(ret)) { | |
613 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | |
614 | return ret; | |
615 | } | |
616 | ||
617 | _mark_read4write_pages_uptodate(ios_read, ret); | |
9ff19309 BH |
618 | ore_put_io_state(ios_read); |
619 | ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ | |
769ba8d9 BH |
620 | return 0; |
621 | } | |
622 | ||
a1fec1db BH |
623 | /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ |
624 | int _ore_add_parity_unit(struct ore_io_state *ios, | |
625 | struct ore_striping_info *si, | |
626 | struct ore_per_dev_state *per_dev, | |
ce5d36aa | 627 | unsigned cur_len, bool do_xor) |
a1fec1db BH |
628 | { |
629 | if (ios->reading) { | |
361aba56 BH |
630 | if (per_dev->cur_sg >= ios->sgs_per_dev) { |
631 | ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , | |
632 | per_dev->cur_sg, ios->sgs_per_dev); | |
633 | return -ENOMEM; | |
634 | } | |
a1fec1db BH |
635 | _ore_add_sg_seg(per_dev, cur_len, true); |
636 | } else { | |
769ba8d9 | 637 | struct __stripe_pages_2d *sp2d = ios->sp2d; |
a1fec1db | 638 | struct page **pages = ios->parity_pages + ios->cur_par_page; |
769ba8d9 | 639 | unsigned num_pages; |
a1fec1db BH |
640 | unsigned array_start = 0; |
641 | unsigned i; | |
642 | int ret; | |
643 | ||
769ba8d9 BH |
644 | si->cur_pg = _sp2d_min_pg(sp2d); |
645 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | |
646 | ||
769ba8d9 BH |
647 | if (!per_dev->length) { |
648 | per_dev->offset += si->cur_pg * PAGE_SIZE; | |
649 | /* If first stripe, Read in all read4write pages | |
650 | * (if needed) before we calculate the first parity. | |
651 | */ | |
ce5d36aa BH |
652 | if (do_xor) |
653 | _read_4_write_first_stripe(ios); | |
769ba8d9 | 654 | } |
ce5d36aa BH |
655 | if (!cur_len && do_xor) |
656 | /* If last stripe r4w pages of last stripe */ | |
9ff19309 BH |
657 | _read_4_write_last_stripe(ios); |
658 | _read_4_write_execute(ios); | |
769ba8d9 | 659 | |
a1fec1db BH |
660 | for (i = 0; i < num_pages; i++) { |
661 | pages[i] = _raid_page_alloc(); | |
662 | if (unlikely(!pages[i])) | |
663 | return -ENOMEM; | |
664 | ||
665 | ++(ios->cur_par_page); | |
a1fec1db BH |
666 | } |
667 | ||
ce5d36aa | 668 | BUG_ON(si->cur_comp < sp2d->data_devs); |
769ba8d9 | 669 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); |
a1fec1db BH |
670 | |
671 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | |
672 | per_dev, num_pages * PAGE_SIZE); | |
673 | if (unlikely(ret)) | |
674 | return ret; | |
769ba8d9 | 675 | |
ce5d36aa BH |
676 | if (do_xor) { |
677 | _gen_xor_unit(sp2d); | |
678 | _sp2d_reset(sp2d, ios->r4w, ios->private); | |
679 | } | |
a1fec1db BH |
680 | } |
681 | return 0; | |
682 | } | |
683 | ||
684 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |
685 | { | |
769ba8d9 | 686 | if (ios->parity_pages) { |
9ff19309 | 687 | struct ore_layout *layout = ios->layout; |
769ba8d9 | 688 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; |
769ba8d9 BH |
689 | |
690 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | |
691 | layout->parity, &ios->sp2d)) { | |
692 | return -ENOMEM; | |
693 | } | |
769ba8d9 | 694 | } |
a1fec1db BH |
695 | return 0; |
696 | } | |
697 | ||
698 | void _ore_free_raid_stuff(struct ore_io_state *ios) | |
699 | { | |
769ba8d9 | 700 | if (ios->sp2d) { /* writing and raid */ |
a1fec1db BH |
701 | unsigned i; |
702 | ||
703 | for (i = 0; i < ios->cur_par_page; i++) { | |
704 | struct page *page = ios->parity_pages[i]; | |
705 | ||
706 | if (page) | |
707 | _raid_page_free(page); | |
708 | } | |
709 | if (ios->extra_part_alloc) | |
710 | kfree(ios->parity_pages); | |
769ba8d9 BH |
711 | /* If IO returned an error pages might need unlocking */ |
712 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | |
713 | _sp2d_free(ios->sp2d); | |
a1fec1db BH |
714 | } else { |
715 | /* Will only be set if raid reading && sglist is big */ | |
716 | if (ios->extra_part_alloc) | |
717 | kfree(ios->per_dev[0].sglist); | |
718 | } | |
769ba8d9 BH |
719 | if (ios->ios_read_4_write) |
720 | ore_put_io_state(ios->ios_read_4_write); | |
a1fec1db | 721 | } |