[linux.git] / drivers/iommu/amd/io_pgtable.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <[email protected]>
 */

#define pr_fmt(fmt)     "AMD-Vi: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"
#include "../iommu-pages.h"

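/*
 * The io-pgtable TLB callbacks below are intentionally empty: the AMD
 * driver flushes the IOTLB through its own domain flush helpers (see
 * iommu_v1_map_pages()), not through the io-pgtable framework.
 */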
static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
                              size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
                            unsigned long iova, size_t granule,
                            void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
        .tlb_flush_all  = v1_tlb_flush_all,
        .tlb_flush_walk = v1_tlb_flush_walk,
        .tlb_add_page   = v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
                         unsigned long *count)
{
        unsigned long pte_mask, pg_size, cnt;
        u64 *fpte;

        pg_size  = PTE_PAGE_SIZE(*pte);
        cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
        pte_mask = ~((cnt << 3) - 1);
        fpte     = (u64 *)(((unsigned long)pte) & pte_mask);

        if (page_size)
                *page_size = pg_size;

        if (count)
                *count = cnt;

        return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

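/*
 * Queue a page-table page on @freelist so it can be released once it is
 * guaranteed to be no longer referenced by the IOMMU.
 */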
static void free_pt_page(u64 *pt, struct list_head *freelist)
{
        struct page *p = virt_to_page(pt);

        list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
        u64 *p;
        int i;

        for (i = 0; i < 512; ++i) {
                /* PTE present? */
                if (!IOMMU_PTE_PRESENT(pt[i]))
                        continue;

                /* Large PTE? */
                if (PM_PTE_LEVEL(pt[i]) == 0 ||
                    PM_PTE_LEVEL(pt[i]) == 7)
                        continue;

                /*
                 * Free the next level. No need to look at l1 tables here since
                 * they can only contain leaf PTEs; just free them directly.
                 */
                p = IOMMU_PTE_PAGE(pt[i]);
                if (lvl > 2)
                        free_pt_lvl(p, freelist, lvl - 1);
                else
                        free_pt_page(p, freelist);
        }

        free_pt_page(pt, freelist);
}

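/*
 * Free a sub-page-table rooted at @root. @mode tells how many levels the
 * table has; PAGE_MODE_NONE and PAGE_MODE_7_LEVEL roots have nothing to
 * free.
 */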
static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
        switch (mode) {
        case PAGE_MODE_NONE:
        case PAGE_MODE_7_LEVEL:
                break;
        case PAGE_MODE_1_LEVEL:
                free_pt_page(root, freelist);
                break;
        case PAGE_MODE_2_LEVEL:
        case PAGE_MODE_3_LEVEL:
        case PAGE_MODE_4_LEVEL:
        case PAGE_MODE_5_LEVEL:
        case PAGE_MODE_6_LEVEL:
                free_pt_lvl(root, freelist, mode);
                break;
        default:
                BUG();
        }
}

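/*
 * Publish a new page-table root for @domain. The paging mode is encoded in
 * the lowest 3 bits of the root pointer.
 */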
void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
                                  u64 *root, int mode)
{
        u64 pt_root;

        /* lowest 3 bits encode pgtable mode */
        pt_root = mode & 7;
        pt_root |= (u64)root;

        amd_iommu_domain_set_pt_root(domain, pt_root);
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
static bool increase_address_space(struct protection_domain *domain,
                                   unsigned long address,
                                   gfp_t gfp)
{
        unsigned long flags;
        bool ret = true;
        u64 *pte;

        pte = iommu_alloc_page_node(domain->nid, gfp);
        if (!pte)
                return false;

        spin_lock_irqsave(&domain->lock, flags);

        if (address <= PM_LEVEL_SIZE(domain->iop.mode))
                goto out;

        ret = false;
        if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
                goto out;

        *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

        domain->iop.root  = pte;
        domain->iop.mode += 1;
        amd_iommu_update_and_flush_device_table(domain);
        amd_iommu_domain_flush_complete(domain);

        /*
         * Device Table needs to be updated and flushed before the new root can
         * be published.
         */
        amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

        pte = NULL;
        ret = true;

out:
        spin_unlock_irqrestore(&domain->lock, flags);
        iommu_free_page(pte);

        return ret;
}

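/*
 * Walk the page table for @address and return a pointer to the PTE that maps
 * a page of @page_size bytes, allocating intermediate levels as needed.
 * Conflicting large mappings are torn down on the way, and *@updated is set
 * whenever a present entry had to be replaced.
 */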
static u64 *alloc_pte(struct protection_domain *domain,
                      unsigned long address,
                      unsigned long page_size,
                      u64 **pte_page,
                      gfp_t gfp,
                      bool *updated)
{
        int level, end_lvl;
        u64 *pte, *page;

        BUG_ON(!is_power_of_2(page_size));

        while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
                /*
                 * Return an error if there is no memory to update the
                 * page-table.
                 */
                if (!increase_address_space(domain, address, gfp))
                        return NULL;
        }

        level   = domain->iop.mode - 1;
        pte     = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
        address = PAGE_SIZE_ALIGN(address, page_size);
        end_lvl = PAGE_SIZE_LEVEL(page_size);

        while (level > end_lvl) {
                u64 __pte, __npte;
                int pte_level;

                __pte     = *pte;
                pte_level = PM_PTE_LEVEL(__pte);

                /*
                 * If we replace a series of large PTEs, we need
                 * to tear down all of them.
                 */
                if (IOMMU_PTE_PRESENT(__pte) &&
                    pte_level == PAGE_MODE_7_LEVEL) {
                        unsigned long count, i;
                        u64 *lpte;

                        lpte = first_pte_l7(pte, NULL, &count);

                        /*
                         * Unmap the replicated PTEs that still match the
                         * original large mapping
                         */
                        for (i = 0; i < count; ++i)
                                cmpxchg64(&lpte[i], __pte, 0ULL);

                        *updated = true;
                        continue;
                }

                if (!IOMMU_PTE_PRESENT(__pte) ||
                    pte_level == PAGE_MODE_NONE) {
                        page = iommu_alloc_page_node(domain->nid, gfp);

                        if (!page)
                                return NULL;

                        __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

                        /* pte could have been changed somewhere. */
                        if (!try_cmpxchg64(pte, &__pte, __npte))
                                iommu_free_page(page);
                        else if (IOMMU_PTE_PRESENT(__pte))
                                *updated = true;

                        continue;
                }

                /* No level skipping support yet */
                if (pte_level != level)
                        return NULL;

                level -= 1;

                pte = IOMMU_PTE_PAGE(__pte);

                if (pte_page && level == end_lvl)
                        *pte_page = pte;

                pte = &pte[PM_LEVEL_INDEX(level, address)];
        }

        return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
                      unsigned long address,
                      unsigned long *page_size)
{
        int level;
        u64 *pte;

        *page_size = 0;

        if (address > PM_LEVEL_SIZE(pgtable->mode))
                return NULL;

        level      =  pgtable->mode - 1;
        pte        = &pgtable->root[PM_LEVEL_INDEX(level, address)];
        *page_size =  PTE_LEVEL_PAGE_SIZE(level);

        while (level > 0) {

                /* Not Present */
                if (!IOMMU_PTE_PRESENT(*pte))
                        return NULL;

                /* Large PTE */
                if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
                    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
                        break;

                /* No level skipping support yet */
                if (PM_PTE_LEVEL(*pte) != level)
                        return NULL;

                level -= 1;

                /* Walk to the next level */
                pte        = IOMMU_PTE_PAGE(*pte);
                pte        = &pte[PM_LEVEL_INDEX(level, address)];
                *page_size = PTE_LEVEL_PAGE_SIZE(level);
        }

        /*
         * If we have a series of large PTEs, make
         * sure to return a pointer to the first one.
         */
        if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
                pte = first_pte_l7(pte, page_size, NULL);

        return pte;
}

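/*
 * Atomically clear a PTE and, if it pointed to a lower-level page table,
 * queue that sub-table on @freelist for deferred freeing.
 */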
static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
        u64 *pt;
        int mode;

        while (!try_cmpxchg64(pte, &pteval, 0))
                pr_warn("IOMMU pte changed since we read it\n");

        if (!IOMMU_PTE_PRESENT(pteval))
                return;

        pt   = IOMMU_PTE_PAGE(pteval);
        mode = IOMMU_PTE_MODE(pteval);

        free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space and allocates the page table pages if necessary.
 * In the future it can be extended to a generic mapping function
 * supporting all features of AMD IOMMU page tables like level skipping
 * and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
                              phys_addr_t paddr, size_t pgsize, size_t pgcount,
                              int prot, gfp_t gfp, size_t *mapped)
{
        struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
        LIST_HEAD(freelist);
        bool updated = false;
        u64 __pte, *pte;
        int ret, i, count;
        size_t size = pgcount << __ffs(pgsize);
        unsigned long o_iova = iova;

        BUG_ON(!IS_ALIGNED(iova, pgsize));
        BUG_ON(!IS_ALIGNED(paddr, pgsize));

        ret = -EINVAL;
        if (!(prot & IOMMU_PROT_MASK))
                goto out;

        while (pgcount > 0) {
                count = PAGE_SIZE_PTE_COUNT(pgsize);
                pte   = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);

                ret = -ENOMEM;
                if (!pte)
                        goto out;

                for (i = 0; i < count; ++i)
                        free_clear_pte(&pte[i], pte[i], &freelist);

                if (!list_empty(&freelist))
                        updated = true;

                if (count > 1) {
                        __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
                        __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
                } else
                        __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

                if (prot & IOMMU_PROT_IR)
                        __pte |= IOMMU_PTE_IR;
                if (prot & IOMMU_PROT_IW)
                        __pte |= IOMMU_PTE_IW;

                for (i = 0; i < count; ++i)
                        pte[i] = __pte;

                iova  += pgsize;
                paddr += pgsize;
                pgcount--;
                if (mapped)
                        *mapped += pgsize;
        }

        ret = 0;

out:
        if (updated) {
                unsigned long flags;

                spin_lock_irqsave(&dom->lock, flags);
                /*
                 * Flush domain TLB(s) and wait for completion. Any Device-Table
                 * Updates and flushing already happened in
                 * increase_address_space().
                 */
                amd_iommu_domain_flush_pages(dom, o_iova, size);
                spin_unlock_irqrestore(&dom->lock, flags);
        }

        /* Everything flushed out, free pages now */
        iommu_put_pages_list(&freelist);

        return ret;
}

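/*
 * Unmap up to @pgcount pages of @pgsize bytes starting at @iova. Replicated
 * PTEs of a large mapping are cleared as a group. Returns the number of
 * bytes actually unmapped, which may be less than requested if a non-present
 * PTE is encountered.
 */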
static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
                                          unsigned long iova,
                                          size_t pgsize, size_t pgcount,
                                          struct iommu_iotlb_gather *gather)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long long unmapped;
        unsigned long unmap_size;
        u64 *pte;
        size_t size = pgcount << __ffs(pgsize);

        BUG_ON(!is_power_of_2(pgsize));

        unmapped = 0;

        while (unmapped < size) {
                pte = fetch_pte(pgtable, iova, &unmap_size);
                if (pte) {
                        int i, count;

                        count = PAGE_SIZE_PTE_COUNT(unmap_size);
                        for (i = 0; i < count; i++)
                                pte[i] = 0ULL;
                } else {
                        return unmapped;
                }

                iova = (iova & ~(unmap_size - 1)) + unmap_size;
                unmapped += unmap_size;
        }

        return unmapped;
}

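/*
 * Translate an IOVA to the physical address it is mapped to, or return 0 if
 * no mapping exists.
 */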
static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long offset_mask, pte_pgsize;
        u64 *pte, __pte;

        pte = fetch_pte(pgtable, iova, &pte_pgsize);

        if (!pte || !IOMMU_PTE_PRESENT(*pte))
                return 0;

        offset_mask = pte_pgsize - 1;
        __pte       = __sme_clr(*pte & PM_ADDR_MASK);

        return (__pte & ~offset_mask) | (iova & offset_mask);
}

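/*
 * Test (and, unless IOMMU_DIRTY_NO_CLEAR is set, clear) the Host Dirty bit
 * across all replicated PTEs that back a mapping of @size bytes.
 */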
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
                                     unsigned long flags)
{
        bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
        bool dirty = false;
        int i, count;

        /*
         * 2.2.3.2 Host Dirty Support
         * When a non-default page size is used, software must OR the
         * Dirty bits in all of the replicated host PTEs used to map
         * the page. The IOMMU does not guarantee the Dirty bits are
         * set in all of the replicated PTEs. Any portion of the page
         * may have been written even if the Dirty bit is set in only
         * one of the replicated PTEs.
         */
        count = PAGE_SIZE_PTE_COUNT(size);
        for (i = 0; i < count && test_only; i++) {
                if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
                        dirty = true;
                        break;
                }
        }

        for (i = 0; i < count && !test_only; i++) {
                if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
                                       (unsigned long *)&ptep[i])) {
                        dirty = true;
                }
        }

        return dirty;
}

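/*
 * Walk the IOVA range and record every dirty mapping in @dirty, clearing the
 * Dirty bits along the way unless IOMMU_DIRTY_NO_CLEAR was passed.
 */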
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
                                         unsigned long iova, size_t size,
                                         unsigned long flags,
                                         struct iommu_dirty_bitmap *dirty)
{
        struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
        unsigned long end = iova + size - 1;

        do {
                unsigned long pgsize = 0;
                u64 *ptep, pte;

                ptep = fetch_pte(pgtable, iova, &pgsize);
                if (ptep)
                        pte = READ_ONCE(*ptep);
                if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
                        pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
                        iova += pgsize;
                        continue;
                }

                /*
                 * Mark the whole IOVA range as dirty even if only one of
                 * the replicated PTEs was marked dirty.
                 */
                if (pte_test_and_clear_dirty(ptep, pgsize, flags))
                        iommu_dirty_bitmap_record(dirty, iova, pgsize);
                iova += pgsize;
        } while (iova < end);

        return 0;
}

/*
 * ----------------------------------------------------
 */
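/*
 * Tear down a v1 page table once it is no longer visible to the IOMMU and
 * release all of its pages.
 */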
static void v1_free_pgtable(struct io_pgtable *iop)
{
        struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
        struct protection_domain *dom;
        LIST_HEAD(freelist);

        if (pgtable->mode == PAGE_MODE_NONE)
                return;

        dom = container_of(pgtable, struct protection_domain, iop);

        /* Page-table is not visible to IOMMU anymore, so free it */
        BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
               pgtable->mode > PAGE_MODE_6_LEVEL);

        free_sub_pt(pgtable->root, pgtable->mode, &freelist);

        /* Update data structure */
        amd_iommu_domain_clr_pt_root(dom);

        /* Make changes visible to IOMMUs */
        amd_iommu_domain_update(dom);

        iommu_put_pages_list(&freelist);
}

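/*
 * Set up the io_pgtable_cfg and the io_pgtable_ops for a v1 (host) page
 * table and hand the embedded struct io_pgtable back to the core code.
 */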
static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
        struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

        cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES;
        cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
        cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;
        cfg->tlb            = &v1_flush_ops;

        pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
        pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
        pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
        pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

        return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
        .alloc  = v1_alloc_pgtable,
        .free   = v1_free_pgtable,
};