drivers/infiniband/hw/hfi1/user_exp_rcv.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright(c) 2020 Cornelis Networks, Inc.
4  * Copyright(c) 2015-2018 Intel Corporation.
5  */
6 #include <asm/page.h>
7 #include <linux/string.h>
8
9 #include "mmu_rb.h"
10 #include "user_exp_rcv.h"
11 #include "trace.h"
12
13 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
14                             struct exp_tid_set *set,
15                             struct hfi1_filedata *fd);
16 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
17 static int set_rcvarray_entry(struct hfi1_filedata *fd,
18                               struct tid_user_buf *tbuf,
19                               u32 rcventry, struct tid_group *grp,
20                               u16 pageidx, unsigned int npages);
21 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
22                                     struct tid_rb_node *tnode);
23 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
24                               const struct mmu_notifier_range *range,
25                               unsigned long cur_seq);
26 static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
27                                  const struct mmu_notifier_range *range,
28                                  unsigned long cur_seq);
29 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
30                             struct tid_group *grp, u16 count,
31                             u32 *tidlist, unsigned int *tididx,
32                             unsigned int *pmapped);
33 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
34 static void __clear_tid_node(struct hfi1_filedata *fd,
35                              struct tid_rb_node *node);
36 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
37
38 static const struct mmu_interval_notifier_ops tid_mn_ops = {
39         .invalidate = tid_rb_invalidate,
40 };
41 static const struct mmu_interval_notifier_ops tid_cover_ops = {
42         .invalidate = tid_cover_invalidate,
43 };
44
45 /*
46  * Initialize context and file private data needed for Expected
47  * receive caching. This needs to be done after the context has
48  * been configured with the eager/expected RcvEntry counts.
49  */
50 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
51                            struct hfi1_ctxtdata *uctxt)
52 {
53         int ret = 0;
54
55         fd->entry_to_rb = kcalloc(uctxt->expected_count,
56                                   sizeof(struct rb_node *),
57                                   GFP_KERNEL);
58         if (!fd->entry_to_rb)
59                 return -ENOMEM;
60
61         if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
62                 fd->invalid_tid_idx = 0;
63                 fd->invalid_tids = kcalloc(uctxt->expected_count,
64                                            sizeof(*fd->invalid_tids),
65                                            GFP_KERNEL);
66                 if (!fd->invalid_tids) {
67                         kfree(fd->entry_to_rb);
68                         fd->entry_to_rb = NULL;
69                         return -ENOMEM;
70                 }
71                 fd->use_mn = true;
72         }
73
74         /*
75          * PSM does not have a good way to separate, count, and
76          * effectively enforce a limit on RcvArray entries used by
77          * subctxts (when context sharing is used) when TID caching
78          * is enabled. To help with that, we calculate a per-process
79          * RcvArray entry share and enforce that.
80          * If TID caching is not in use, PSM deals with usage on its
81          * own. In that case, we allow any subctxt to take all of the
82          * entries.
83          *
84          * Make sure that we set the tid counts only after successful
85          * init.
86          */
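        /*
         * Worked example (illustrative numbers): with expected_count = 2048
         * and subctxt_cnt = 3, the division gives 682 with a remainder of 2,
         * so subctxts 0 and 1 get 683 entries each and subctxt 2 gets 682.
         */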
87         spin_lock(&fd->tid_lock);
88         if (uctxt->subctxt_cnt && fd->use_mn) {
89                 u16 remainder;
90
91                 fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
92                 remainder = uctxt->expected_count % uctxt->subctxt_cnt;
93                 if (remainder && fd->subctxt < remainder)
94                         fd->tid_limit++;
95         } else {
96                 fd->tid_limit = uctxt->expected_count;
97         }
98         spin_unlock(&fd->tid_lock);
99
100         return ret;
101 }
102
103 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
104 {
105         struct hfi1_ctxtdata *uctxt = fd->uctxt;
106
107         mutex_lock(&uctxt->exp_mutex);
108         if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
109                 unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
110         if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
111                 unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
112         mutex_unlock(&uctxt->exp_mutex);
113
114         kfree(fd->invalid_tids);
115         fd->invalid_tids = NULL;
116
117         kfree(fd->entry_to_rb);
118         fd->entry_to_rb = NULL;
119 }
120
121 /*
122  * Release pinned receive buffer pages.
123  *
124  * @mapped: true if the pages have been DMA mapped. false otherwise.
125  * @idx: Index of the first page to unpin.
126  * @npages: Number of pages to unpin.
127  *
128  * If the pages have been DMA mapped (indicated by mapped parameter), their
129  * info will be passed via a struct tid_rb_node. If they haven't been mapped,
130  * their info will be passed via a struct tid_user_buf.
131  */
132 static void unpin_rcv_pages(struct hfi1_filedata *fd,
133                             struct tid_user_buf *tidbuf,
134                             struct tid_rb_node *node,
135                             unsigned int idx,
136                             unsigned int npages,
137                             bool mapped)
138 {
139         struct page **pages;
140         struct hfi1_devdata *dd = fd->uctxt->dd;
141         struct mm_struct *mm;
142
143         if (mapped) {
144                 dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
145                                  node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
146                 pages = &node->pages[idx];
147                 mm = mm_from_tid_node(node);
148         } else {
149                 pages = &tidbuf->pages[idx];
150                 mm = current->mm;
151         }
152         hfi1_release_user_pages(mm, pages, npages, mapped);
153         fd->tid_n_pinned -= npages;
154 }
155
156 /*
157  * Pin receive buffer pages.
158  */
159 static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
160 {
161         int pinned;
162         unsigned int npages = tidbuf->npages;
163         unsigned long vaddr = tidbuf->vaddr;
164         struct page **pages = NULL;
165         struct hfi1_devdata *dd = fd->uctxt->dd;
166
167         if (npages > fd->uctxt->expected_count) {
168                 dd_dev_err(dd, "Expected buffer too big\n");
169                 return -EINVAL;
170         }
171
172         /* Allocate the array of struct page pointers needed for pinning */
173         pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
174         if (!pages)
175                 return -ENOMEM;
176
177         /*
178          * Pin all the pages of the user buffer. If we can't pin all the
179          * pages, accept the amount pinned so far and program only that.
180          * User space knows how to deal with partially programmed buffers.
181          */
182         if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
183                 kfree(pages);
184                 return -ENOMEM;
185         }
186
187         pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
188         if (pinned <= 0) {
189                 kfree(pages);
190                 return pinned;
191         }
192         tidbuf->pages = pages;
193         fd->tid_n_pinned += pinned;
194         return pinned;
195 }
196
197 /*
198  * RcvArray entry allocation for Expected Receives is done by the
199  * following algorithm:
200  *
201  * The context keeps 3 lists of groups of RcvArray entries:
202  *   1. List of empty groups - tid_group_list
203  *      This list is created during user context creation and
204  *      contains elements which describe sets (of 8) of empty
205  *      RcvArray entries.
206  *   2. List of partially used groups - tid_used_list
207  *      This list contains sets of RcvArray entries which are
208  *      not completely used up. Another mapping request could
209  *      use some or all of the remaining entries.
210  *   3. List of full groups - tid_full_list
211  *      This is the list where sets that are completely used
212  *      up go.
213  *
214  * An attempt to optimize the usage of RcvArray entries is
215  * made by finding all sets of physically contiguous pages in a
216  * user's buffer.
217  * These physically contiguous sets are further split into
218  * sizes supported by the receive engine of the HFI. The
219  * resulting sets of pages are stored in struct tid_pageset,
220  * which describes the sets as:
221  *    * .count - number of pages in this set
222  *    * .idx - starting index into struct page ** array
223  *                    of this set
224  *
225  * From this point on, the algorithm deals with the page sets
226  * described above. The number of pagesets is divided by the
227  * RcvArray group size to produce the number of full groups
228  * needed.
229  *
230  * Groups from the 3 lists are manipulated using the following
231  * rules:
232  *   1. For each set of 8 pagesets, a complete group from
233  *      tid_group_list is taken, programmed, and moved to
234  *      the tid_full_list list.
235  *   2. For all remaining pagesets:
236  *      2.1 If the tid_used_list is empty and the tid_group_list
237  *          is empty, stop processing pagesets and return only
238  *          what has been programmed up to this point.
239  *      2.2 If the tid_used_list is empty and the tid_group_list
240  *          is not empty, move a group from tid_group_list to
241  *          tid_used_list.
242  *      2.3 For each group in tid_used_list, program as much as
243  *          can fit into the group. If the group becomes fully
244  *          used, move it to tid_full_list.
245  */
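/*
 * Worked example of the above (illustrative numbers): with a group size
 * of 8, a buffer that splits into 20 pagesets consumes two complete
 * groups from tid_group_list (16 entries, both moved to tid_full_list)
 * and then 4 entries from a group on tid_used_list, pulling a fresh
 * group over from tid_group_list if tid_used_list is empty.
 */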
246 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
247                             struct hfi1_tid_info *tinfo)
248 {
249         int ret = 0, need_group = 0, pinned;
250         struct hfi1_ctxtdata *uctxt = fd->uctxt;
251         struct hfi1_devdata *dd = uctxt->dd;
252         unsigned int ngroups, pageset_count,
253                 tididx = 0, mapped, mapped_pages = 0;
254         u32 *tidlist = NULL;
255         struct tid_user_buf *tidbuf;
256         unsigned long mmu_seq = 0;
257
258         if (!PAGE_ALIGNED(tinfo->vaddr))
259                 return -EINVAL;
260         if (tinfo->length == 0)
261                 return -EINVAL;
262
263         tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
264         if (!tidbuf)
265                 return -ENOMEM;
266
267         mutex_init(&tidbuf->cover_mutex);
268         tidbuf->vaddr = tinfo->vaddr;
269         tidbuf->length = tinfo->length;
270         tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
271         tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
272                                 GFP_KERNEL);
273         if (!tidbuf->psets) {
274                 ret = -ENOMEM;
275                 goto fail_release_mem;
276         }
277
278         if (fd->use_mn) {
279                 ret = mmu_interval_notifier_insert(
280                         &tidbuf->notifier, current->mm,
281                         tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
282                         &tid_cover_ops);
283                 if (ret)
284                         goto fail_release_mem;
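                /*
                 * Sample the notifier sequence before pinning; it is
                 * re-checked with mmu_interval_read_retry() once the
                 * TIDs have been programmed, so a concurrent unmap of
                 * the buffer fails the whole setup (-EBUSY) instead of
                 * leaving stale RcvArray entries behind.
                 */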
285                 mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
286         }
287
288         pinned = pin_rcv_pages(fd, tidbuf);
289         if (pinned <= 0) {
290                 ret = (pinned < 0) ? pinned : -ENOSPC;
291                 goto fail_unpin;
292         }
293
294         /* Find sets of physically contiguous pages */
295         tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
296
297         /* Reserve the number of expected tids to be used. */
298         spin_lock(&fd->tid_lock);
299         if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
300                 pageset_count = fd->tid_limit - fd->tid_used;
301         else
302                 pageset_count = tidbuf->n_psets;
303         fd->tid_used += pageset_count;
304         spin_unlock(&fd->tid_lock);
305
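        /*
         * Illustration of the reservation above: with tid_limit = 64,
         * tid_used = 60 and n_psets = 10, only 4 pagesets are reserved;
         * any of them that end up unprogrammed are handed back in the
         * "adjust reserved tid_used" step further down.
         */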
306         if (!pageset_count) {
307                 ret = -ENOSPC;
308                 goto fail_unreserve;
309         }
310
311         ngroups = pageset_count / dd->rcv_entries.group_size;
312         tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
313         if (!tidlist) {
314                 ret = -ENOMEM;
315                 goto fail_unreserve;
316         }
317
318         tididx = 0;
319
320         /*
321          * From this point on, we are going to be using shared (between master
322          * and subcontexts) context resources. We need to take the lock.
323          */
324         mutex_lock(&uctxt->exp_mutex);
325         /*
326          * The first step is to program the RcvArray entries which are complete
327          * groups.
328          */
329         while (ngroups && uctxt->tid_group_list.count) {
330                 struct tid_group *grp =
331                         tid_group_pop(&uctxt->tid_group_list);
332
333                 ret = program_rcvarray(fd, tidbuf, grp,
334                                        dd->rcv_entries.group_size,
335                                        tidlist, &tididx, &mapped);
336                 /*
337                  * If there was a failure to program the RcvArray
338                  * entries for the entire group, reset the grp fields
339                  * and add the grp back to the free group list.
340                  */
341                 if (ret <= 0) {
342                         tid_group_add_tail(grp, &uctxt->tid_group_list);
343                         hfi1_cdbg(TID,
344                                   "Failed to program RcvArray group %d", ret);
345                         goto unlock;
346                 }
347
348                 tid_group_add_tail(grp, &uctxt->tid_full_list);
349                 ngroups--;
350                 mapped_pages += mapped;
351         }
352
353         while (tididx < pageset_count) {
354                 struct tid_group *grp, *ptr;
355                 /*
356                  * If we don't have any partially used tid groups, check
357                  * if we have empty groups. If so, take one from there and
358                  * put in the partially used list.
359                  */
360                 if (!uctxt->tid_used_list.count || need_group) {
361                         if (!uctxt->tid_group_list.count)
362                                 goto unlock;
363
364                         grp = tid_group_pop(&uctxt->tid_group_list);
365                         tid_group_add_tail(grp, &uctxt->tid_used_list);
366                         need_group = 0;
367                 }
368                 /*
369                  * There is an optimization opportunity here - instead of
370                  * fitting as many page sets as we can, check for a group
371                  * later on in the list that could fit all of them.
372                  */
373                 list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
374                                          list) {
375                         unsigned use = min_t(unsigned, pageset_count - tididx,
376                                              grp->size - grp->used);
377
378                         ret = program_rcvarray(fd, tidbuf, grp,
379                                                use, tidlist,
380                                                &tididx, &mapped);
381                         if (ret < 0) {
382                                 hfi1_cdbg(TID,
383                                           "Failed to program RcvArray entries %d",
384                                           ret);
385                                 goto unlock;
386                         } else if (ret > 0) {
387                                 if (grp->used == grp->size)
388                                         tid_group_move(grp,
389                                                        &uctxt->tid_used_list,
390                                                        &uctxt->tid_full_list);
391                                 mapped_pages += mapped;
392                                 need_group = 0;
393                                 /* Check if we are done so we break out early */
394                                 if (tididx >= pageset_count)
395                                         break;
396                         } else if (WARN_ON(ret == 0)) {
397                                 /*
398                                  * If ret is 0, we did not program any entries
399                                  * into this group, which can only happen if
400                                  * we've screwed up the accounting somewhere.
401                                  * Warn and try to continue.
402                                  */
403                                 need_group = 1;
404                         }
405                 }
406         }
407 unlock:
408         mutex_unlock(&uctxt->exp_mutex);
409         hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
410                   mapped_pages, ret);
411
412         /* fail if nothing was programmed; set an error code if none was set */
413         if (tididx == 0) {
414                 if (ret >= 0)
415                         ret = -ENOSPC;
416                 goto fail_unreserve;
417         }
418
419         /* adjust reserved tid_used to actual count */
420         spin_lock(&fd->tid_lock);
421         fd->tid_used -= pageset_count - tididx;
422         spin_unlock(&fd->tid_lock);
423
424         /* unpin all pages not covered by a TID */
425         unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
426                         false);
427
428         if (fd->use_mn) {
429                 /* check for an invalidate during setup */
430                 bool fail = false;
431
432                 mutex_lock(&tidbuf->cover_mutex);
433                 fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
434                 mutex_unlock(&tidbuf->cover_mutex);
435
436                 if (fail) {
437                         ret = -EBUSY;
438                         goto fail_unprogram;
439                 }
440         }
441
442         tinfo->tidcnt = tididx;
443         tinfo->length = mapped_pages * PAGE_SIZE;
444
445         if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
446                          tidlist, sizeof(tidlist[0]) * tididx)) {
447                 ret = -EFAULT;
448                 goto fail_unprogram;
449         }
450
451         if (fd->use_mn)
452                 mmu_interval_notifier_remove(&tidbuf->notifier);
453         kfree(tidbuf->pages);
454         kfree(tidbuf->psets);
455         kfree(tidbuf);
456         kfree(tidlist);
457         return 0;
458
459 fail_unprogram:
460         /* unprogram, unmap, and unpin all allocated TIDs */
461         tinfo->tidlist = (unsigned long)tidlist;
462         hfi1_user_exp_rcv_clear(fd, tinfo);
463         tinfo->tidlist = 0;
464         pinned = 0;             /* nothing left to unpin */
465         pageset_count = 0;      /* nothing left reserved */
466 fail_unreserve:
467         spin_lock(&fd->tid_lock);
468         fd->tid_used -= pageset_count;
469         spin_unlock(&fd->tid_lock);
470 fail_unpin:
471         if (fd->use_mn)
472                 mmu_interval_notifier_remove(&tidbuf->notifier);
473         if (pinned > 0)
474                 unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
475 fail_release_mem:
476         kfree(tidbuf->pages);
477         kfree(tidbuf->psets);
478         kfree(tidbuf);
479         kfree(tidlist);
480         return ret;
481 }
482
483 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
484                             struct hfi1_tid_info *tinfo)
485 {
486         int ret = 0;
487         struct hfi1_ctxtdata *uctxt = fd->uctxt;
488         u32 *tidinfo;
489         unsigned tididx;
490
491         if (unlikely(tinfo->tidcnt > fd->tid_used))
492                 return -EINVAL;
493
494         tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
495                                     tinfo->tidcnt, sizeof(tidinfo[0]));
496         if (IS_ERR(tidinfo))
497                 return PTR_ERR(tidinfo);
498
499         mutex_lock(&uctxt->exp_mutex);
500         for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
501                 ret = unprogram_rcvarray(fd, tidinfo[tididx]);
502                 if (ret) {
503                         hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
504                                   ret);
505                         break;
506                 }
507         }
508         spin_lock(&fd->tid_lock);
509         fd->tid_used -= tididx;
510         spin_unlock(&fd->tid_lock);
511         tinfo->tidcnt = tididx;
512         mutex_unlock(&uctxt->exp_mutex);
513
514         kfree(tidinfo);
515         return ret;
516 }
517
518 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
519                               struct hfi1_tid_info *tinfo)
520 {
521         struct hfi1_ctxtdata *uctxt = fd->uctxt;
522         unsigned long *ev = uctxt->dd->events +
523                 (uctxt_offset(uctxt) + fd->subctxt);
524         u32 *array;
525         int ret = 0;
526
527         /*
528          * copy_to_user() can sleep, which will leave the invalid_lock
529          * locked and cause the MMU notifier to be blocked on the lock
530          * for a long time.
531          * Copy the data to a local buffer so we can release the lock.
532          */
533         array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
534         if (!array)
535                 return -EFAULT;
536
537         spin_lock(&fd->invalid_lock);
538         if (fd->invalid_tid_idx) {
539                 memcpy(array, fd->invalid_tids, sizeof(*array) *
540                        fd->invalid_tid_idx);
541                 memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
542                        fd->invalid_tid_idx);
543                 tinfo->tidcnt = fd->invalid_tid_idx;
544                 fd->invalid_tid_idx = 0;
545                 /*
546                  * Reset the user flag while still holding the lock.
547                  * Otherwise, PSM can miss events.
548                  */
549                 clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
550         } else {
551                 tinfo->tidcnt = 0;
552         }
553         spin_unlock(&fd->invalid_lock);
554
555         if (tinfo->tidcnt) {
556                 if (copy_to_user((void __user *)tinfo->tidlist,
557                                  array, sizeof(*array) * tinfo->tidcnt))
558                         ret = -EFAULT;
559         }
560         kfree(array);
561
562         return ret;
563 }
564
565 static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
566 {
567         unsigned pagecount, pageidx, setcount = 0, i;
568         unsigned long pfn, this_pfn;
569         struct page **pages = tidbuf->pages;
570         struct tid_pageset *list = tidbuf->psets;
571
572         if (!npages)
573                 return 0;
574
575         /*
576          * Look for sets of physically contiguous pages in the user buffer.
577          * This will allow us to optimize Expected RcvArray entry usage by
578          * using the bigger supported sizes.
579          */
580         pfn = page_to_pfn(pages[0]);
581         for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
582                 this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
583
584                 /*
585                  * If the pfn's are not sequential, pages are not physically
586                  * contiguous.
587                  */
588                 if (this_pfn != ++pfn) {
589                         /*
590                          * At this point we have to loop over the set of
591                          * physically contiguous pages and break them down into
592                          * sizes supported by the HW.
593                          * There are two main constraints:
594                          *     1. The max buffer size is MAX_EXPECTED_BUFFER.
595                          *        If the total set size is bigger than that
596                          *        program only a MAX_EXPECTED_BUFFER chunk.
597                          *     2. The buffer size has to be a power of two. If
598                          *        it is not, round down to the closest power of
599                          *        2 and program that size.
600                          */
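                        /*
                         * Example of the split (assuming 4 KiB pages):
                         * a run of 7 contiguous pages is emitted as
                         * sets of 4, 2 and 1 pages, since 28 KiB is not
                         * a power of two; a very long run is first
                         * capped at MAX_EXPECTED_BUFFER.
                         */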
601                         while (pagecount) {
602                                 int maxpages = pagecount;
603                                 u32 bufsize = pagecount * PAGE_SIZE;
604
605                                 if (bufsize > MAX_EXPECTED_BUFFER)
606                                         maxpages =
607                                                 MAX_EXPECTED_BUFFER >>
608                                                 PAGE_SHIFT;
609                                 else if (!is_power_of_2(bufsize))
610                                         maxpages =
611                                                 rounddown_pow_of_two(bufsize) >>
612                                                 PAGE_SHIFT;
613
614                                 list[setcount].idx = pageidx;
615                                 list[setcount].count = maxpages;
616                                 pagecount -= maxpages;
617                                 pageidx += maxpages;
618                                 setcount++;
619                         }
620                         pageidx = i;
621                         pagecount = 1;
622                         pfn = this_pfn;
623                 } else {
624                         pagecount++;
625                 }
626         }
627         return setcount;
628 }
629
630 /**
631  * program_rcvarray() - program an RcvArray group with receive buffers
632  * @fd: filedata pointer
633  * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
634  *        virtual address, buffer length, page pointers, pagesets (array of
635  *        struct tid_pageset holding information on physically contiguous
636  *        chunks from the user buffer), and other fields.
637  * @grp: RcvArray group
638  * @count: number of struct tid_pageset's to program
639  * @tidlist: the array of u32 elements where the information about the
640  *           programmed RcvArray entries is to be encoded.
641  * @tididx: starting offset into tidlist
642  * @pmapped: (output parameter) number of pages programmed into the RcvArray
643  *           entries.
644  *
645  * This function will program up to 'count' number of RcvArray entries from the
646  * group 'grp'. To make best use of write-combining writes, the function will
647  * perform writes to the unused RcvArray entries which will be ignored by the
648  * HW. Each RcvArray entry will be programmed with a physically contiguous
649  * buffer chunk from the user's virtual buffer.
650  *
651  * Return:
652  * -EINVAL if the requested count is larger than the size of the group,
653  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
654  * number of RcvArray entries programmed.
655  */
656 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
657                             struct tid_group *grp, u16 count,
658                             u32 *tidlist, unsigned int *tididx,
659                             unsigned int *pmapped)
660 {
661         struct hfi1_ctxtdata *uctxt = fd->uctxt;
662         struct hfi1_devdata *dd = uctxt->dd;
663         u16 idx;
664         unsigned int start = *tididx;
665         u32 tidinfo = 0, rcventry, useidx = 0;
666         int mapped = 0;
667
668         /* Count should never be larger than the group size */
669         if (count > grp->size)
670                 return -EINVAL;
671
672         /* Find the first unused entry in the group */
673         for (idx = 0; idx < grp->size; idx++) {
674                 if (!(grp->map & (1 << idx))) {
675                         useidx = idx;
676                         break;
677                 }
678                 rcv_array_wc_fill(dd, grp->base + idx);
679         }
680
681         idx = 0;
682         while (idx < count) {
683                 u16 npages, pageidx, setidx = start + idx;
684                 int ret = 0;
685
686                 /*
687                  * If this entry in the group is used, move to the next one.
688                  * If we go past the end of the group, exit the loop.
689                  */
690                 if (useidx >= grp->size) {
691                         break;
692                 } else if (grp->map & (1 << useidx)) {
693                         rcv_array_wc_fill(dd, grp->base + useidx);
694                         useidx++;
695                         continue;
696                 }
697
698                 rcventry = grp->base + useidx;
699                 npages = tbuf->psets[setidx].count;
700                 pageidx = tbuf->psets[setidx].idx;
701
702                 ret = set_rcvarray_entry(fd, tbuf,
703                                          rcventry, grp, pageidx,
704                                          npages);
705                 if (ret)
706                         return ret;
707                 mapped += npages;
708
709                 tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
710                 tidlist[(*tididx)++] = tidinfo;
711                 grp->used++;
712                 grp->map |= 1 << useidx++;
713                 idx++;
714         }
715
716         /* Fill the rest of the group with "blank" writes */
717         for (; useidx < grp->size; useidx++)
718                 rcv_array_wc_fill(dd, grp->base + useidx);
719         *pmapped = mapped;
720         return idx;
721 }
722
723 static int set_rcvarray_entry(struct hfi1_filedata *fd,
724                               struct tid_user_buf *tbuf,
725                               u32 rcventry, struct tid_group *grp,
726                               u16 pageidx, unsigned int npages)
727 {
728         int ret;
729         struct hfi1_ctxtdata *uctxt = fd->uctxt;
730         struct tid_rb_node *node;
731         struct hfi1_devdata *dd = uctxt->dd;
732         dma_addr_t phys;
733         struct page **pages = tbuf->pages + pageidx;
734
735         /*
736          * Allocate the node first so we can handle a potential
737          * failure before we've programmed anything.
738          */
739         node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
740         if (!node)
741                 return -ENOMEM;
742
743         phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
744                               npages * PAGE_SIZE, DMA_FROM_DEVICE);
745         if (dma_mapping_error(&dd->pcidev->dev, phys)) {
746                 dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
747                            phys);
748                 kfree(node);
749                 return -EFAULT;
750         }
751
752         node->fdata = fd;
753         mutex_init(&node->invalidate_mutex);
754         node->phys = page_to_phys(pages[0]);
755         node->npages = npages;
756         node->rcventry = rcventry;
757         node->dma_addr = phys;
758         node->grp = grp;
759         node->freed = false;
760         memcpy(node->pages, pages, flex_array_size(node, pages, npages));
761
762         if (fd->use_mn) {
763                 ret = mmu_interval_notifier_insert(
764                         &node->notifier, current->mm,
765                         tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
766                         &tid_mn_ops);
767                 if (ret)
768                         goto out_unmap;
769         }
770         fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
771
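        /*
         * The pagesets produced by find_phys_blocks() are sized as powers
         * of two, so ilog2(npages) + 1 is exact and corresponds to the
         * power-of-two buffer size this RcvArray entry covers.
         */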
772         hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
773         trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
774                                node->notifier.interval_tree.start, node->phys,
775                                phys);
776         return 0;
777
778 out_unmap:
779         hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
780                   node->rcventry, node->notifier.interval_tree.start,
781                   node->phys, ret);
782         dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
783                          DMA_FROM_DEVICE);
784         kfree(node);
785         return -EFAULT;
786 }
787
788 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
789 {
790         struct hfi1_ctxtdata *uctxt = fd->uctxt;
791         struct hfi1_devdata *dd = uctxt->dd;
792         struct tid_rb_node *node;
793         u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
794         u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
795
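        /*
         * The TID word pairs up RcvArray entries: IDX selects the pair and
         * CTRL (1 or 2) selects the entry within it, e.g. IDX 2 / CTRL 2
         * decodes to rcventry 5.  CTRL values of 0 and 3 do not describe a
         * single entry and are rejected below.
         */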
796         if (tidctrl == 0x3 || tidctrl == 0x0)
797                 return -EINVAL;
798
799         rcventry = tididx + (tidctrl - 1);
800
801         if (rcventry >= uctxt->expected_count) {
802                 dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
803                            rcventry, uctxt->ctxt);
804                 return -EINVAL;
805         }
806
807         node = fd->entry_to_rb[rcventry];
808         if (!node || node->rcventry != (uctxt->expected_base + rcventry))
809                 return -EBADF;
810
811         if (fd->use_mn)
812                 mmu_interval_notifier_remove(&node->notifier);
813         cacheless_tid_rb_remove(fd, node);
814
815         return 0;
816 }
817
818 static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
819 {
820         struct hfi1_ctxtdata *uctxt = fd->uctxt;
821         struct hfi1_devdata *dd = uctxt->dd;
822
823         mutex_lock(&node->invalidate_mutex);
824         if (node->freed)
825                 goto done;
826         node->freed = true;
827
828         trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
829                                  node->npages,
830                                  node->notifier.interval_tree.start, node->phys,
831                                  node->dma_addr);
832
833         /* Make sure device has seen the write before pages are unpinned */
834         hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
835
836         unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
837 done:
838         mutex_unlock(&node->invalidate_mutex);
839 }
840
841 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
842 {
843         struct hfi1_ctxtdata *uctxt = fd->uctxt;
844
845         __clear_tid_node(fd, node);
846
847         node->grp->used--;
848         node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
849
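        /*
         * Keep the group on the list matching its new occupancy: a group
         * that was full drops back to tid_used_list, and a group whose
         * last used entry was just cleared goes all the way back to
         * tid_group_list.
         */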
850         if (node->grp->used == node->grp->size - 1)
851                 tid_group_move(node->grp, &uctxt->tid_full_list,
852                                &uctxt->tid_used_list);
853         else if (!node->grp->used)
854                 tid_group_move(node->grp, &uctxt->tid_used_list,
855                                &uctxt->tid_group_list);
856         kfree(node);
857 }
858
859 /*
860  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
861  * clearing nodes in the non-cached case.
862  */
863 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
864                             struct exp_tid_set *set,
865                             struct hfi1_filedata *fd)
866 {
867         struct tid_group *grp, *ptr;
868         int i;
869
870         list_for_each_entry_safe(grp, ptr, &set->list, list) {
871                 list_del_init(&grp->list);
872
873                 for (i = 0; i < grp->size; i++) {
874                         if (grp->map & (1 << i)) {
875                                 u16 rcventry = grp->base + i;
876                                 struct tid_rb_node *node;
877
878                                 node = fd->entry_to_rb[rcventry -
879                                                           uctxt->expected_base];
880                                 if (!node || node->rcventry != rcventry)
881                                         continue;
882
883                                 if (fd->use_mn)
884                                         mmu_interval_notifier_remove(
885                                                 &node->notifier);
886                                 cacheless_tid_rb_remove(fd, node);
887                         }
888                 }
889         }
890 }
891
892 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
893                               const struct mmu_notifier_range *range,
894                               unsigned long cur_seq)
895 {
896         struct tid_rb_node *node =
897                 container_of(mni, struct tid_rb_node, notifier);
898         struct hfi1_filedata *fdata = node->fdata;
899         struct hfi1_ctxtdata *uctxt = fdata->uctxt;
900
901         if (node->freed)
902                 return true;
903
904         /* take action only if unmapping */
905         if (range->event != MMU_NOTIFY_UNMAP)
906                 return true;
907
908         trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
909                                  node->notifier.interval_tree.start,
910                                  node->rcventry, node->npages, node->dma_addr);
911
912         /* clear the hardware rcvarray entry */
913         __clear_tid_node(fdata, node);
914
915         spin_lock(&fdata->invalid_lock);
916         if (fdata->invalid_tid_idx < uctxt->expected_count) {
917                 fdata->invalid_tids[fdata->invalid_tid_idx] =
918                         create_tid(node->rcventry - uctxt->expected_base,
919                                    node->npages);
920                 if (!fdata->invalid_tid_idx) {
921                         unsigned long *ev;
922
923                         /*
924                          * hfi1_set_uevent_bits() sets a user event flag
925                          * for all processes. Because calling into the
926                          * driver to process TID cache invalidations is
927                          * expensive and TID cache invalidations are
928                          * handled on a per-process basis, we can
929                          * optimize this to set the flag only for the
930                          * process in question.
931                          */
932                         ev = uctxt->dd->events +
933                                 (uctxt_offset(uctxt) + fdata->subctxt);
934                         set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
935                 }
936                 fdata->invalid_tid_idx++;
937         }
938         spin_unlock(&fdata->invalid_lock);
939         return true;
940 }
941
942 static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
943                                  const struct mmu_notifier_range *range,
944                                  unsigned long cur_seq)
945 {
946         struct tid_user_buf *tidbuf =
947                 container_of(mni, struct tid_user_buf, notifier);
948
949         /* take action only if unmapping */
950         if (range->event == MMU_NOTIFY_UNMAP) {
951                 mutex_lock(&tidbuf->cover_mutex);
952                 mmu_interval_set_seq(mni, cur_seq);
953                 mutex_unlock(&tidbuf->cover_mutex);
954         }
955
956         return true;
957 }
958
959 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
960                                     struct tid_rb_node *tnode)
961 {
962         u32 base = fdata->uctxt->expected_base;
963
964         fdata->entry_to_rb[tnode->rcventry - base] = NULL;
965         clear_tid_node(fdata, tnode);
966 }