/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

/*
 * The ib_umem list keeps track of memory regions for which the HW
 * device requests to receive notification when the related memory
 * mapping is changed.
 *
 * ib_umem_lock protects the list.
 */

static u64 node_start(struct umem_odp_node *n)
{
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);

        return ib_umem_start(&umem_odp->umem);
}

/* Note that the representation of the intervals in the interval tree
 * considers the ending point as contained in the interval, while the
 * function ib_umem_end returns the first address which is not contained
 * in the umem.
 */
static u64 node_last(struct umem_odp_node *n)
{
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);

        return ib_umem_end(&umem_odp->umem) - 1;
}

INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
                     node_start, node_last, static, rbt_ib_umem)
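
/*
 * Example of the convention above (illustrative numbers only): with a 4K
 * page size, a umem covering the single page at address 0x1000 has
 * ib_umem_start() == 0x1000 and ib_umem_end() == 0x2000, but is stored in
 * the interval tree as the closed interval [0x1000, 0x1fff].
 */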

static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
{
        mutex_lock(&umem_odp->umem_mutex);
        if (umem_odp->notifiers_count++ == 0)
                /*
                 * Initialize the completion object for waiting on
                 * notifiers. Since notifier_count is zero, no one should be
                 * waiting right now.
                 */
                reinit_completion(&umem_odp->notifier_completion);
        mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
{
        mutex_lock(&umem_odp->umem_mutex);
        /*
         * This sequence increase will notify the QP page fault handler that
         * the page it is about to map in the spte could have been freed.
         */
        ++umem_odp->notifiers_seq;
        if (--umem_odp->notifiers_count == 0)
                complete_all(&umem_odp->notifier_completion);
        mutex_unlock(&umem_odp->umem_mutex);
}

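/*
 * Together, the two helpers above implement a seqcount-like retry protocol
 * for the page-fault path (see ib_umem_mmu_notifier_retry() and its use in
 * ib_umem_odp_map_dma_single_page() below): a fault handler samples
 * notifiers_seq before pinning pages, and refuses to install a mapping if
 * notifiers_count is non-zero or notifiers_seq has changed, i.e. if an
 * invalidation is running or has run concurrently. Code that must wait for
 * all notifiers to finish can sleep on notifier_completion. This is a
 * summary of the code in this file, not a stable API contract.
 */
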
static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
                                               u64 start, u64 end, void *cookie)
{
        struct ib_umem *umem = &umem_odp->umem;

        /*
         * Increase the number of notifiers running, to
         * prevent any further fault handling on this MR.
         */
        ib_umem_notifier_start_account(umem_odp);
        umem_odp->dying = 1;
        /* Make sure that the fact the umem is dying is out before we release
         * all pending page faults. */
        smp_wmb();
        complete_all(&umem_odp->notifier_completion);
        umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
                                        ib_umem_end(umem));
        return 0;
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
{
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);

        down_read(&per_mm->umem_rwsem);
        if (per_mm->active)
                rbt_ib_umem_for_each_in_range(
                        &per_mm->umem_tree, 0, ULLONG_MAX,
                        ib_umem_notifier_release_trampoline, true, NULL);
        up_read(&per_mm->umem_rwsem);
}

static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
                                             u64 start, u64 end, void *cookie)
{
        ib_umem_notifier_start_account(item);
        item->umem.context->invalidate_range(item, start, end);
        return 0;
}

static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                                   struct mm_struct *mm,
                                                   unsigned long start,
                                                   unsigned long end,
                                                   bool blockable)
{
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);

        if (blockable)
                down_read(&per_mm->umem_rwsem);
        else if (!down_read_trylock(&per_mm->umem_rwsem))
                return -EAGAIN;

        if (!per_mm->active) {
                up_read(&per_mm->umem_rwsem);
                /*
                 * At this point active is permanently set and visible to this
                 * CPU without a lock; that fact is relied on to skip the
                 * unlock in range_end.
                 */
                return 0;
        }

        return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
                                             invalidate_range_start_trampoline,
                                             blockable, NULL);
}

static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
                                           u64 end, void *cookie)
{
        ib_umem_notifier_end_account(item);
        return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
                                                  struct mm_struct *mm,
                                                  unsigned long start,
                                                  unsigned long end)
{
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);

        if (unlikely(!per_mm->active))
                return;

        rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
                                      end,
                                      invalidate_range_end_trampoline, true,
                                      NULL);
        up_read(&per_mm->umem_rwsem);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
        .release                    = ib_umem_notifier_release,
        .invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
        .invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

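/*
 * These callbacks are registered once per owning mm_struct, in
 * alloc_per_mm() below. Note the locking pattern they rely on:
 * invalidate_range_start() takes umem_rwsem for read and leaves it held,
 * and the matching invalidate_range_end() drops it, so the interval tree
 * cannot change while an invalidation is in flight.
 */
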
static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
{
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
        struct ib_umem *umem = &umem_odp->umem;

        down_write(&per_mm->umem_rwsem);
        if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
                rbt_ib_umem_insert(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        up_write(&per_mm->umem_rwsem);
}

static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
{
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
        struct ib_umem *umem = &umem_odp->umem;

        down_write(&per_mm->umem_rwsem);
        if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
                rbt_ib_umem_remove(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        complete_all(&umem_odp->notifier_completion);

        up_write(&per_mm->umem_rwsem);
}

static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
                                               struct mm_struct *mm)
{
        struct ib_ucontext_per_mm *per_mm;
        int ret;

        per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
        if (!per_mm)
                return ERR_PTR(-ENOMEM);

        per_mm->context = ctx;
        per_mm->mm = mm;
        per_mm->umem_tree = RB_ROOT_CACHED;
        init_rwsem(&per_mm->umem_rwsem);
        per_mm->active = ctx->invalidate_range;

        rcu_read_lock();
        per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
        rcu_read_unlock();

        WARN_ON(mm != current->mm);

        per_mm->mn.ops = &ib_umem_notifiers;
        ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
        if (ret) {
                dev_err(&ctx->device->dev,
                        "Failed to register mmu_notifier %d\n", ret);
                goto out_pid;
        }

        list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
        return per_mm;

out_pid:
        put_pid(per_mm->tgid);
        kfree(per_mm);
        return ERR_PTR(ret);
}

static int get_per_mm(struct ib_umem_odp *umem_odp)
{
        struct ib_ucontext *ctx = umem_odp->umem.context;
        struct ib_ucontext_per_mm *per_mm;

        /*
         * Generally speaking we expect only one or two per_mm in this list,
         * so no reason to optimize this search today.
         */
        mutex_lock(&ctx->per_mm_list_lock);
        list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
                if (per_mm->mm == umem_odp->umem.owning_mm)
                        goto found;
        }

        per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
        if (IS_ERR(per_mm)) {
                mutex_unlock(&ctx->per_mm_list_lock);
                return PTR_ERR(per_mm);
        }

found:
        umem_odp->per_mm = per_mm;
        per_mm->odp_mrs_count++;
        mutex_unlock(&ctx->per_mm_list_lock);

        return 0;
}
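
/*
 * Lifecycle note: each ODP umem holds a reference on its per_mm via
 * odp_mrs_count. get_per_mm() above and the odp_mrs_count increment in
 * ib_alloc_odp_umem() take that reference; put_per_mm() drops it and, on
 * the last drop, unregisters the mmu notifier and frees the structure
 * after an SRCU grace period.
 */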

static void free_per_mm(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
}

void put_per_mm(struct ib_umem_odp *umem_odp)
{
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
        struct ib_ucontext *ctx = umem_odp->umem.context;
        bool need_free;

        mutex_lock(&ctx->per_mm_list_lock);
        umem_odp->per_mm = NULL;
        per_mm->odp_mrs_count--;
        need_free = per_mm->odp_mrs_count == 0;
        if (need_free)
                list_del(&per_mm->ucontext_list);
        mutex_unlock(&ctx->per_mm_list_lock);

        if (!need_free)
                return;

        /*
         * NOTE! mmu_notifier_unregister() can happen between a start/end
         * callback, resulting in a start without a matching end, and thus
         * an unbalanced lock. This doesn't really matter to us since we are
         * about to kfree the memory that holds the lock, however LOCKDEP
         * doesn't like this.
         */
        down_write(&per_mm->umem_rwsem);
        per_mm->active = false;
        up_write(&per_mm->umem_rwsem);

        WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
        mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
        put_pid(per_mm->tgid);
        mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
}

struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
                                      unsigned long addr, size_t size)
{
        struct ib_ucontext *ctx = per_mm->context;
        struct ib_umem_odp *odp_data;
        struct ib_umem *umem;
        int pages = size >> PAGE_SHIFT;
        int ret;

        odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
        if (!odp_data)
                return ERR_PTR(-ENOMEM);
        umem = &odp_data->umem;
        umem->context = ctx;
        umem->length = size;
        umem->address = addr;
        umem->page_shift = PAGE_SHIFT;
        umem->writable = 1;
        umem->is_odp = 1;
        odp_data->per_mm = per_mm;

        mutex_init(&odp_data->umem_mutex);
        init_completion(&odp_data->notifier_completion);

        odp_data->page_list =
                vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
        if (!odp_data->page_list) {
                ret = -ENOMEM;
                goto out_odp_data;
        }

        odp_data->dma_list =
                vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
        if (!odp_data->dma_list) {
                ret = -ENOMEM;
                goto out_page_list;
        }

        /*
         * Caller must ensure that the umem_odp that the per_mm came from
         * cannot be freed during the call to ib_alloc_odp_umem.
         */
        mutex_lock(&ctx->per_mm_list_lock);
        per_mm->odp_mrs_count++;
        mutex_unlock(&ctx->per_mm_list_lock);
        add_umem_to_per_mm(odp_data);

        return odp_data;

out_page_list:
        vfree(odp_data->page_list);
out_odp_data:
        kfree(odp_data);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_alloc_odp_umem);
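
/*
 * Typical use (illustrative, not a contract): a driver implementing
 * implicit ODP can call ib_alloc_odp_umem() to create child umems that
 * cover fixed-size, page-aligned slices of an implicit MR, e.g.:
 *
 *      odp = ib_alloc_odp_umem(per_mm, ALIGN_DOWN(fault_addr, chunk_size),
 *                              chunk_size);
 *      if (IS_ERR(odp))
 *              return PTR_ERR(odp);
 *
 * where fault_addr and chunk_size are hypothetical driver-side values and
 * chunk_size is page aligned.
 */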

int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
{
        struct ib_umem *umem = &umem_odp->umem;
        /*
         * NOTE: This must be called in a process context where
         * umem->owning_mm == current->mm
         */
        struct mm_struct *mm = umem->owning_mm;
        int ret_val;

        if (access & IB_ACCESS_HUGETLB) {
                struct vm_area_struct *vma;
                struct hstate *h;

                down_read(&mm->mmap_sem);
                vma = find_vma(mm, ib_umem_start(umem));
                if (!vma || !is_vm_hugetlb_page(vma)) {
                        up_read(&mm->mmap_sem);
                        return -EINVAL;
                }
                h = hstate_vma(vma);
                umem->page_shift = huge_page_shift(h);
                up_read(&mm->mmap_sem);
                umem->hugetlb = 1;
        } else {
                umem->hugetlb = 0;
        }

        mutex_init(&umem_odp->umem_mutex);

        init_completion(&umem_odp->notifier_completion);

        if (ib_umem_num_pages(umem)) {
                umem_odp->page_list =
                        vzalloc(array_size(sizeof(*umem_odp->page_list),
                                           ib_umem_num_pages(umem)));
                if (!umem_odp->page_list)
                        return -ENOMEM;

                umem_odp->dma_list =
                        vzalloc(array_size(sizeof(*umem_odp->dma_list),
                                           ib_umem_num_pages(umem)));
                if (!umem_odp->dma_list) {
                        ret_val = -ENOMEM;
                        goto out_page_list;
                }
        }

        ret_val = get_per_mm(umem_odp);
        if (ret_val)
                goto out_dma_list;
        add_umem_to_per_mm(umem_odp);

        return 0;

out_dma_list:
        vfree(umem_odp->dma_list);
out_page_list:
        vfree(umem_odp->page_list);
        return ret_val;
}
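
/*
 * Note on callers (as of this code): ib_umem_odp_get() is invoked from the
 * core umem code when a registration requests IB_ACCESS_ON_DEMAND. The
 * generic ib_umem fields (context, owning_mm, address, length, page_shift)
 * are expected to be initialized before the call, and the umem is torn
 * down with ib_umem_odp_release() below.
 */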

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
        struct ib_umem *umem = &umem_odp->umem;

        /*
         * Ensure that no more pages are mapped in the umem.
         *
         * It is the driver's responsibility to ensure, before calling us,
         * that the hardware will not attempt to access the MR any more.
         */
        ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
                                    ib_umem_end(umem));

        remove_umem_from_per_mm(umem_odp);
        put_per_mm(umem_odp);
        vfree(umem_odp->dma_list);
        vfree(umem_odp->page_list);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem_odp: the umem into which to insert the page.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               the sequence number is taken from
 *               umem_odp->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
                struct ib_umem_odp *umem_odp,
                int page_index,
                struct page *page,
                u64 access_mask,
                unsigned long current_seq)
{
        struct ib_umem *umem = &umem_odp->umem;
        struct ib_device *dev = umem->context->device;
        dma_addr_t dma_addr;
        int stored_page = 0;
        int remove_existing_mapping = 0;
        int ret = 0;

        /*
         * Note: we avoid writing if seq is different from the initial seq, to
         * handle the case of a racing notifier. This check also allows us to
         * bail early if we have a notifier running in parallel with us.
         */
        if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
                ret = -EAGAIN;
                goto out;
        }
        if (!(umem_odp->dma_list[page_index])) {
                dma_addr = ib_dma_map_page(dev,
                                           page,
                                           0, BIT(umem->page_shift),
                                           DMA_BIDIRECTIONAL);
                if (ib_dma_mapping_error(dev, dma_addr)) {
                        ret = -EFAULT;
                        goto out;
                }
                umem_odp->dma_list[page_index] = dma_addr | access_mask;
                umem_odp->page_list[page_index] = page;
                umem->npages++;
                stored_page = 1;
        } else if (umem_odp->page_list[page_index] == page) {
                umem_odp->dma_list[page_index] |= access_mask;
        } else {
                pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
                       umem_odp->page_list[page_index], page);
                /* Better remove the mapping now, to prevent any further
                 * damage. */
                remove_existing_mapping = 1;
        }

out:
        /* On Demand Paging - avoid pinning the page */
        if (umem->context->invalidate_range || !stored_page)
                put_page(page);

        if (remove_existing_mapping && umem->context->invalidate_range) {
                ib_umem_notifier_start_account(umem_odp);
                umem->context->invalidate_range(
                        umem_odp,
                        ib_umem_start(umem) + (page_index << umem->page_shift),
                        ib_umem_start(umem) +
                                ((page_index + 1) << umem->page_shift));
                ib_umem_notifier_end_account(umem_odp);
                ret = -EAGAIN;
        }

        return ret;
}

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem_odp->dma_list.
 *
 * Returns the number of pages mapped on success, and a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being
 * terminated and the mm was already destroyed.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem_odp->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                              u64 bcnt, u64 access_mask,
                              unsigned long current_seq)
{
        struct ib_umem *umem = &umem_odp->umem;
        struct task_struct *owning_process = NULL;
        struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
        struct page **local_page_list = NULL;
        u64 page_mask, off;
        int j, k, ret = 0, start_idx, npages = 0, page_shift;
        unsigned int flags = 0;
        phys_addr_t p = 0;

        if (access_mask == 0)
                return -EINVAL;

        if (user_virt < ib_umem_start(umem) ||
            user_virt + bcnt > ib_umem_end(umem))
                return -EFAULT;

        local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
        if (!local_page_list)
                return -ENOMEM;

        page_shift = umem->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        off = user_virt & (~page_mask);
        user_virt = user_virt & page_mask;
        bcnt += off; /* Charge for the first page offset as well. */

        /*
         * owning_process is allowed to be NULL, this means the mm somehow
         * exists beyond the lifetime of the originating process. Presumably
         * mmget_not_zero will fail in this case.
         */
        owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
        if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
                ret = -EINVAL;
                goto out_put_task;
        }

        if (access_mask & ODP_WRITE_ALLOWED_BIT)
                flags |= FOLL_WRITE;

        start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
        k = start_idx;

        while (bcnt > 0) {
                const size_t gup_num_pages = min_t(size_t,
                                (bcnt + BIT(page_shift) - 1) >> page_shift,
                                PAGE_SIZE / sizeof(struct page *));

                down_read(&owning_mm->mmap_sem);
                /*
                 * Note: this might result in redundant page getting. We can
                 * avoid this by checking dma_list to be 0 before calling
                 * get_user_pages. However, this makes the code much more
                 * complex (and doesn't gain us much performance in most use
                 * cases).
                 */
                npages = get_user_pages_remote(owning_process, owning_mm,
                                user_virt, gup_num_pages,
                                flags, local_page_list, NULL, NULL);
                up_read(&owning_mm->mmap_sem);

                if (npages < 0)
                        break;

                bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
                mutex_lock(&umem_odp->umem_mutex);
                for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
                        if (user_virt & ~page_mask) {
                                p += PAGE_SIZE;
                                if (page_to_phys(local_page_list[j]) != p) {
                                        ret = -EFAULT;
                                        break;
                                }
                                put_page(local_page_list[j]);
                                continue;
                        }

                        ret = ib_umem_odp_map_dma_single_page(
                                        umem_odp, k, local_page_list[j],
                                        access_mask, current_seq);
                        if (ret < 0)
                                break;

                        p = page_to_phys(local_page_list[j]);
                        k++;
                }
                mutex_unlock(&umem_odp->umem_mutex);

                if (ret < 0) {
                        /* Release left over pages when handling errors. */
                        for (++j; j < npages; ++j)
                                put_page(local_page_list[j]);
                        break;
                }
        }

        if (ret >= 0) {
                if (npages < 0 && k == start_idx)
                        ret = npages;
                else
                        ret = k - start_idx;
        }

        mmput(owning_mm);
out_put_task:
        if (owning_process)
                put_task_struct(owning_process);
        free_page((unsigned long)local_page_list);
        return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
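
/*
 * Illustrative fault-path usage (a simplified sketch, not copied from any
 * driver; "hw_fault_addr", "bcnt" and "install_device_mapping()" are
 * hypothetical):
 *
 *      unsigned long seq = umem_odp->notifiers_seq;
 *      int npages;
 *
 *      npages = ib_umem_odp_map_dma_pages(umem_odp, hw_fault_addr, bcnt,
 *                                         ODP_READ_ALLOWED_BIT |
 *                                         ODP_WRITE_ALLOWED_BIT, seq);
 *      if (npages < 0)
 *              return npages;
 *
 *      mutex_lock(&umem_odp->umem_mutex);
 *      if (!ib_umem_mmu_notifier_retry(umem_odp, seq))
 *              install_device_mapping(umem_odp);  (driver-specific step)
 *      mutex_unlock(&umem_odp->umem_mutex);
 *
 * i.e. the caller samples notifiers_seq first, then re-checks it under
 * umem_mutex before committing the mapping to device page tables.
 */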

void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 u64 bound)
{
        struct ib_umem *umem = &umem_odp->umem;
        int idx;
        u64 addr;
        struct ib_device *dev = umem->context->device;

        virt = max_t(u64, virt, ib_umem_start(umem));
        bound = min_t(u64, bound, ib_umem_end(umem));
        /* Note that during the run of this function, the
         * notifiers_count of the MR is > 0, preventing any racing
         * faults from completing. We might be racing with other
         * invalidations, so we must make sure we free each page only
         * once. */
        mutex_lock(&umem_odp->umem_mutex);
        for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
                idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
                if (umem_odp->page_list[idx]) {
                        struct page *page = umem_odp->page_list[idx];
                        dma_addr_t dma = umem_odp->dma_list[idx];
                        dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

                        WARN_ON(!dma_addr);

                        ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
                                          DMA_BIDIRECTIONAL);
                        if (dma & ODP_WRITE_ALLOWED_BIT) {
                                struct page *head_page = compound_head(page);
                                /*
                                 * set_page_dirty prefers being called with
                                 * the page lock. However, MMU notifiers are
                                 * called sometimes with and sometimes without
                                 * the lock. We rely on the umem_mutex instead
                                 * to prevent other mmu notifiers from
                                 * continuing and allowing the page mapping to
                                 * be removed.
                                 */
                                set_page_dirty(head_page);
                        }
                        /* on demand pinning support */
                        if (!umem->context->invalidate_range)
                                put_page(page);
                        umem_odp->page_list[idx] = NULL;
                        umem_odp->dma_list[idx] = 0;
                        umem->npages--;
                }
        }
        mutex_unlock(&umem_odp->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
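
/*
 * ib_umem_odp_release() above uses ib_umem_odp_unmap_dma_pages() to unmap
 * the whole umem; driver invalidate_range callbacks typically use it to
 * unmap just the invalidated sub-range after zapping the matching device
 * page-table entries (a driver convention, not something this file
 * enforces).
 */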

/* @last is not a part of the interval. See comment for function
 * node_last.
 */
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
                                  u64 start, u64 last,
                                  umem_call_back cb,
                                  bool blockable,
                                  void *cookie)
{
        int ret_val = 0;
        struct umem_odp_node *node, *next;
        struct ib_umem_odp *umem;

        if (unlikely(start == last))
                return ret_val;

        for (node = rbt_ib_umem_iter_first(root, start, last - 1);
                        node; node = next) {
                /* TODO move the blockable decision up to the callback */
                if (!blockable)
                        return -EAGAIN;
                next = rbt_ib_umem_iter_next(node, start, last - 1);
                umem = container_of(node, struct ib_umem_odp, interval_tree);
                ret_val = cb(umem, start, last, cookie) || ret_val;
        }

        return ret_val;
}
EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
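
/*
 * The [start, last) arguments are half-open, matching ib_umem_start()/
 * ib_umem_end(); for example, ib_umem_notifier_release() above walks every
 * umem in the tree by passing the range [0, ULLONG_MAX).
 */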

struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
                                       u64 addr, u64 length)
{
        struct umem_odp_node *node;

        node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
        if (node)
                return container_of(node, struct ib_umem_odp, interval_tree);
        return NULL;
}
EXPORT_SYMBOL(rbt_ib_umem_lookup);