/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * See COPYING in top-level directory.
 */
#include "protocol.h"
#include "pvfs2-kernel.h"
#include "pvfs2-bufmap.h"

DECLARE_WAIT_QUEUE_HEAD(pvfs2_bufmap_init_waitq);

static struct pvfs2_bufmap {
        atomic_t refcnt;

        int desc_size;
        int desc_shift;
        int desc_count;
        int total_size;
        int page_count;

        struct page **page_array;
        struct pvfs_bufmap_desc *desc_array;

        /* array to track usage of buffer descriptors */
        int *buffer_index_array;
        spinlock_t buffer_index_lock;

        /* array to track usage of buffer descriptors for readdir */
        int readdir_index_array[PVFS2_READDIR_DEFAULT_DESC_COUNT];
        spinlock_t readdir_index_lock;
} *__pvfs2_bufmap;

static DEFINE_SPINLOCK(pvfs2_bufmap_lock);

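/*
 * Drop the page references taken by get_user_pages() when the user
 * buffer was mapped in pvfs2_bufmap_map().
 */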
static void
pvfs2_bufmap_unmap(struct pvfs2_bufmap *bufmap)
{
        int i;

        for (i = 0; i < bufmap->page_count; i++)
                page_cache_release(bufmap->page_array[i]);
}

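/*
 * Free the tracking arrays and the bufmap itself; the pinned pages are
 * released separately via pvfs2_bufmap_unmap().
 */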
static void
pvfs2_bufmap_free(struct pvfs2_bufmap *bufmap)
{
        kfree(bufmap->page_array);
        kfree(bufmap->desc_array);
        kfree(bufmap->buffer_index_array);
        kfree(bufmap);
}

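/*
 * Take a reference on the currently installed bufmap, if any; returns
 * NULL when no mapping has been set up by the client-core.  Each
 * successful ref is balanced by pvfs2_bufmap_unref(), which unmaps and
 * frees the bufmap once the last reference is dropped.
 */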
struct pvfs2_bufmap *pvfs2_bufmap_ref(void)
{
        struct pvfs2_bufmap *bufmap = NULL;

        spin_lock(&pvfs2_bufmap_lock);
        if (__pvfs2_bufmap) {
                bufmap = __pvfs2_bufmap;
                atomic_inc(&bufmap->refcnt);
        }
        spin_unlock(&pvfs2_bufmap_lock);
        return bufmap;
}

void pvfs2_bufmap_unref(struct pvfs2_bufmap *bufmap)
{
        if (atomic_dec_and_lock(&bufmap->refcnt, &pvfs2_bufmap_lock)) {
                __pvfs2_bufmap = NULL;
                spin_unlock(&pvfs2_bufmap_lock);

                pvfs2_bufmap_unmap(bufmap);
                pvfs2_bufmap_free(bufmap);
        }
}

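/*
 * Report the per-descriptor buffer size (and, below, its log2 shift)
 * of the current mapping, or 0 if no bufmap is installed.
 */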
inline int pvfs_bufmap_size_query(void)
{
        struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
        int size = bufmap ? bufmap->desc_size : 0;

        if (bufmap)
                pvfs2_bufmap_unref(bufmap);
        return size;
}

inline int pvfs_bufmap_shift_query(void)
{
        struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
        int shift = bufmap ? bufmap->desc_shift : 0;

        if (bufmap)
                pvfs2_bufmap_unref(bufmap);
        return shift;
}

static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);

/*
 * get_bufmap_init
 *
 * If a bufmap has been initialized, the shared memory system, including
 * the buffer_index_array, is available. Otherwise, it is not.
 *
 * returns 1 if the bufmap is initialized, 0 otherwise
 */
int get_bufmap_init(void)
{
        return __pvfs2_bufmap ? 1 : 0;
}

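/*
 * Allocate a bufmap and its tracking arrays from the description
 * (struct PVFS_dev_map_desc) handed in by the client-core; the user
 * pages themselves are pinned later by pvfs2_bufmap_map().
 */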
static struct pvfs2_bufmap *
pvfs2_bufmap_alloc(struct PVFS_dev_map_desc *user_desc)
{
        struct pvfs2_bufmap *bufmap;

        bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
        if (!bufmap)
                goto out;

        atomic_set(&bufmap->refcnt, 1);
        bufmap->total_size = user_desc->total_size;
        bufmap->desc_count = user_desc->count;
        bufmap->desc_size = user_desc->size;
        bufmap->desc_shift = ilog2(bufmap->desc_size);

        spin_lock_init(&bufmap->buffer_index_lock);
        bufmap->buffer_index_array =
                kcalloc(bufmap->desc_count, sizeof(int), GFP_KERNEL);
        if (!bufmap->buffer_index_array) {
                gossip_err("pvfs2: could not allocate %d buffer indices\n",
                           bufmap->desc_count);
                goto out_free_bufmap;
        }
        spin_lock_init(&bufmap->readdir_index_lock);

        bufmap->desc_array =
                kcalloc(bufmap->desc_count, sizeof(struct pvfs_bufmap_desc),
                        GFP_KERNEL);
        if (!bufmap->desc_array) {
                gossip_err("pvfs2: could not allocate %d descriptors\n",
                           bufmap->desc_count);
                goto out_free_index_array;
        }

        bufmap->page_count = bufmap->total_size / PAGE_SIZE;

        /* allocate storage to track our page mappings */
        bufmap->page_array =
                kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
        if (!bufmap->page_array)
                goto out_free_desc_array;

        return bufmap;

out_free_desc_array:
        kfree(bufmap->desc_array);
out_free_index_array:
        kfree(bufmap->buffer_index_array);
out_free_bufmap:
        kfree(bufmap);
out:
        return NULL;
}

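/*
 * Pin the client-core's shared buffer with get_user_pages() and carve
 * the resulting pages into desc_count descriptors of desc_size bytes
 * (pages_per_desc pages) each.
 */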
static int
pvfs2_bufmap_map(struct pvfs2_bufmap *bufmap,
                 struct PVFS_dev_map_desc *user_desc)
{
        int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
        int offset = 0, ret, i;

        /* map the pages */
        down_write(&current->mm->mmap_sem);
        ret = get_user_pages(current,
                             current->mm,
                             (unsigned long)user_desc->ptr,
                             bufmap->page_count,
                             1,
                             0,
                             bufmap->page_array,
                             NULL);
        up_write(&current->mm->mmap_sem);

        if (ret < 0)
                return ret;

        if (ret != bufmap->page_count) {
                gossip_err("pvfs2 error: asked for %d pages, only got %d.\n",
                           bufmap->page_count, ret);

                for (i = 0; i < ret; i++) {
                        SetPageError(bufmap->page_array[i]);
                        page_cache_release(bufmap->page_array[i]);
                }
                return -ENOMEM;
        }

        /*
         * ideally we want to get kernel space pointers for each page, but
         * we can't kmap that many pages at once if highmem is being used.
         * so instead, we just kmap/kunmap the page address each time the
         * kaddr is needed.
         */
        for (i = 0; i < bufmap->page_count; i++)
                flush_dcache_page(bufmap->page_array[i]);

        /* build a list of available descriptors */
        for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
                bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
                bufmap->desc_array[i].array_count = pages_per_desc;
                bufmap->desc_array[i].uaddr =
                        (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
                offset += pages_per_desc;
        }

        return 0;
}

/*
 * pvfs_bufmap_initialize()
 *
 * initializes the mapped buffer interface
 *
 * returns 0 on success, -errno on failure
 */
int pvfs_bufmap_initialize(struct PVFS_dev_map_desc *user_desc)
{
        struct pvfs2_bufmap *bufmap;
        int ret = -EINVAL;

        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                     "pvfs_bufmap_initialize: called (ptr ("
                     "%p) sz (%d) cnt(%d).\n",
                     user_desc->ptr,
                     user_desc->size,
                     user_desc->count);

        /*
         * sanity check alignment and size of buffer that caller wants to
         * work with
         */
        if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
            (unsigned long)user_desc->ptr) {
                gossip_err("pvfs2 error: memory alignment (front). %p\n",
                           user_desc->ptr);
                goto out;
        }

        if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
            != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
                gossip_err("pvfs2 error: memory alignment (back).(%p + %d)\n",
                           user_desc->ptr,
                           user_desc->total_size);
                goto out;
        }

        if (user_desc->total_size != (user_desc->size * user_desc->count)) {
                gossip_err("pvfs2 error: user provided an oddly sized buffer: (%d, %d, %d)\n",
                           user_desc->total_size,
                           user_desc->size,
                           user_desc->count);
                goto out;
        }

        if ((user_desc->size % PAGE_SIZE) != 0) {
                gossip_err("pvfs2 error: bufmap size not page size divisible (%d).\n",
                           user_desc->size);
                goto out;
        }

        ret = -ENOMEM;
        bufmap = pvfs2_bufmap_alloc(user_desc);
        if (!bufmap)
                goto out;

        ret = pvfs2_bufmap_map(bufmap, user_desc);
        if (ret)
                goto out_free_bufmap;

        spin_lock(&pvfs2_bufmap_lock);
        if (__pvfs2_bufmap) {
                spin_unlock(&pvfs2_bufmap_lock);
                gossip_err("pvfs2: error: bufmap already initialized.\n");
                ret = -EALREADY;
                goto out_unmap_bufmap;
        }
        __pvfs2_bufmap = bufmap;
        spin_unlock(&pvfs2_bufmap_lock);

        /*
         * If there are operations in pvfs2_bufmap_init_waitq, wake them up.
         * This scenario occurs when the client-core is restarted and I/O
         * requests in the in-progress or waiting tables are restarted. I/O
         * requests cannot be restarted until the shared memory system is
         * completely re-initialized, so we put the I/O requests in this
         * waitq until initialization has completed. NOTE: the I/O requests
         * are also on a timer, so they don't wait forever just in case the
         * client-core doesn't come back up.
         */
        wake_up_interruptible(&pvfs2_bufmap_init_waitq);

        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                     "pvfs_bufmap_initialize: exiting normally\n");
        return 0;

out_unmap_bufmap:
        pvfs2_bufmap_unmap(bufmap);
out_free_bufmap:
        pvfs2_bufmap_free(bufmap);
out:
        return ret;
}

/*
 * pvfs_bufmap_finalize()
 *
 * shuts down the mapped buffer interface and releases any resources
 * associated with it
 *
 * no return value
 */
void pvfs_bufmap_finalize(void)
{
        gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2_bufmap_finalize: called\n");
        BUG_ON(!__pvfs2_bufmap);
        pvfs2_bufmap_unref(__pvfs2_bufmap);
        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                     "pvfs2_bufmap_finalize: exiting normally\n");
}

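/*
 * Generic slot allocator shared by the I/O descriptors and the readdir
 * descriptors: a slot_array entry is 0 when free and 1 when in use,
 * protected by slot_lock, with waiters sleeping on slot_wq.
 */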
struct slot_args {
        int slot_count;
        int *slot_array;
        spinlock_t *slot_lock;
        wait_queue_head_t *slot_wq;
};

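/*
 * Claim a free slot, sleeping interruptibly (bounded by
 * slot_timeout_secs) until one is released if all are busy.  Returns 0
 * with *buffer_index set, -ETIMEDOUT on timeout, or -EINTR if a signal
 * arrived while waiting.
 */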
static int wait_for_a_slot(struct slot_args *slargs, int *buffer_index)
{
        int ret = -1;
        int i = 0;
        DECLARE_WAITQUEUE(my_wait, current);

        add_wait_queue_exclusive(slargs->slot_wq, &my_wait);

        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * check for available desc, slot_lock is the appropriate
                 * index_lock
                 */
                spin_lock(slargs->slot_lock);
                for (i = 0; i < slargs->slot_count; i++)
                        if (slargs->slot_array[i] == 0) {
                                slargs->slot_array[i] = 1;
                                *buffer_index = i;
                                ret = 0;
                                break;
                        }
                spin_unlock(slargs->slot_lock);

                /* if we acquired a buffer, then break out of while */
                if (ret == 0)
                        break;

                if (!signal_pending(current)) {
                        int timeout =
                            MSECS_TO_JIFFIES(1000 * slot_timeout_secs);
                        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                                     "[BUFMAP]: waiting %d "
                                     "seconds for a slot\n",
                                     slot_timeout_secs);
                        if (!schedule_timeout(timeout)) {
                                gossip_debug(GOSSIP_BUFMAP_DEBUG,
                                             "*** wait_for_a_slot timed out\n");
                                ret = -ETIMEDOUT;
                                break;
                        }
                        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                                     "[BUFMAP]: woken up by a slot becoming available.\n");
                        continue;
                }

                gossip_debug(GOSSIP_BUFMAP_DEBUG, "pvfs2: %s interrupted.\n",
                             __func__);
                ret = -EINTR;
                break;
        }

        set_current_state(TASK_RUNNING);
        remove_wait_queue(slargs->slot_wq, &my_wait);
        return ret;
}

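/* Mark a slot free again and wake anyone sleeping in wait_for_a_slot(). */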
static void put_back_slot(struct slot_args *slargs, int buffer_index)
{
        /* slot_lock is the appropriate index_lock */
        spin_lock(slargs->slot_lock);
        if (buffer_index < 0 || buffer_index >= slargs->slot_count) {
                spin_unlock(slargs->slot_lock);
                return;
        }

        /* put the desc back on the queue */
        slargs->slot_array[buffer_index] = 0;
        spin_unlock(slargs->slot_lock);

        /* wake up anyone who may be sleeping on the queue */
        wake_up_interruptible(slargs->slot_wq);
}

/*
 * pvfs_bufmap_get()
 *
 * gets a free mapped buffer descriptor, will sleep until one becomes
 * available if necessary
 *
 * returns 0 on success, -errno on failure
 */
int pvfs_bufmap_get(struct pvfs2_bufmap **mapp, int *buffer_index)
{
        struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
        struct slot_args slargs;
        int ret;

        if (!bufmap) {
                gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n");
                return -EIO;
        }

        slargs.slot_count = bufmap->desc_count;
        slargs.slot_array = bufmap->buffer_index_array;
        slargs.slot_lock = &bufmap->buffer_index_lock;
        slargs.slot_wq = &bufmap_waitq;
        ret = wait_for_a_slot(&slargs, buffer_index);
        if (ret)
                pvfs2_bufmap_unref(bufmap);
        *mapp = bufmap;
        return ret;
}

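/*
 * Typical caller pattern (a sketch for illustration, not code from this
 * file): a write path grabs a descriptor, fills it from the caller's
 * iov_iter, posts the request to the client-core, then puts the slot
 * back:
 *
 *        struct pvfs2_bufmap *bufmap;
 *        int buffer_index;
 *        int ret;
 *
 *        ret = pvfs_bufmap_get(&bufmap, &buffer_index);
 *        if (ret < 0)
 *                return ret;
 *        ret = pvfs_bufmap_copy_from_iovec(bufmap, iter, buffer_index, size);
 *        ...submit the operation and wait for the client-core...
 *        pvfs_bufmap_put(bufmap, buffer_index);
 */
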
/*
 * pvfs_bufmap_put()
 *
 * returns a mapped buffer descriptor to the collection
 *
 * no return value
 */
void pvfs_bufmap_put(struct pvfs2_bufmap *bufmap, int buffer_index)
{
        struct slot_args slargs;

        slargs.slot_count = bufmap->desc_count;
        slargs.slot_array = bufmap->buffer_index_array;
        slargs.slot_lock = &bufmap->buffer_index_lock;
        slargs.slot_wq = &bufmap_waitq;
        put_back_slot(&slargs, buffer_index);
        pvfs2_bufmap_unref(bufmap);
}

/*
 * readdir_index_get()
 *
 * gets a free descriptor, will sleep until one becomes
 * available if necessary.
 * Although the readdir buffers are not mapped into kernel space,
 * we could do that at a later point in time. Regardless, these
 * indices are used by the client-core.
 *
 * returns 0 on success, -errno on failure
 */
int readdir_index_get(struct pvfs2_bufmap **mapp, int *buffer_index)
{
        struct pvfs2_bufmap *bufmap = pvfs2_bufmap_ref();
        struct slot_args slargs;
        int ret;

        if (!bufmap) {
                gossip_err("pvfs2: please confirm that pvfs2-client daemon is running.\n");
                return -EIO;
        }

        slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT;
        slargs.slot_array = bufmap->readdir_index_array;
        slargs.slot_lock = &bufmap->readdir_index_lock;
        slargs.slot_wq = &readdir_waitq;
        ret = wait_for_a_slot(&slargs, buffer_index);
        if (ret)
                pvfs2_bufmap_unref(bufmap);
        *mapp = bufmap;
        return ret;
}

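/*
 * Return a readdir descriptor index to the pool and drop the bufmap
 * reference taken in readdir_index_get().
 */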
void readdir_index_put(struct pvfs2_bufmap *bufmap, int buffer_index)
{
        struct slot_args slargs;

        slargs.slot_count = PVFS2_READDIR_DEFAULT_DESC_COUNT;
        slargs.slot_array = bufmap->readdir_index_array;
        slargs.slot_lock = &bufmap->readdir_index_lock;
        slargs.slot_wq = &readdir_waitq;
        put_back_slot(&slargs, buffer_index);
        pvfs2_bufmap_unref(bufmap);
}

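/*
 * Copy 'size' bytes from the caller's iov_iter into the pages of the
 * mapped descriptor at buffer_index, one page at a time.  The caller is
 * expected to pass a size no larger than the descriptor size.
 */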
int pvfs_bufmap_copy_from_iovec(struct pvfs2_bufmap *bufmap,
                                struct iov_iter *iter,
                                int buffer_index,
                                size_t size)
{
        struct pvfs_bufmap_desc *to;
        struct page *page;
        size_t copied;
        int i;

        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                     "%s: buffer_index:%d: size:%lu:\n",
                     __func__, buffer_index, size);

        to = &bufmap->desc_array[buffer_index];

        for (i = 0; size; i++) {
                page = to->page_array[i];
                copied = copy_page_from_iter(page, 0, PAGE_SIZE, iter);
                size -= copied;
                if ((copied == 0) && (size))
                        break;
        }

        return size ? -EFAULT : 0;
}

/*
 * Iterate through the array of pages containing the bytes from
 * a file being read and copy them out to the caller's iov_iter.
 */
int pvfs_bufmap_copy_to_iovec(struct pvfs2_bufmap *bufmap,
                              struct iov_iter *iter,
                              int buffer_index)
{
        struct pvfs_bufmap_desc *from;
        struct page *page;
        int i;
        size_t written;

        gossip_debug(GOSSIP_BUFMAP_DEBUG,
                     "%s: buffer_index:%d: iov_iter_count(iter):%lu:\n",
                     __func__, buffer_index, iov_iter_count(iter));

        from = &bufmap->desc_array[buffer_index];

        for (i = 0; iov_iter_count(iter); i++) {
                page = from->page_array[i];
                written = copy_page_to_iter(page, 0, PAGE_SIZE, iter);
                if ((written == 0) && (iov_iter_count(iter)))
                        break;
        }

        return iov_iter_count(iter) ? -EFAULT : 0;
}