iommu/arm-smmu-v3: Reduce contention during command-queue insertion
author     Will Deacon <[email protected]>
           Tue, 2 Jul 2019 16:16:25 +0000 (17:16 +0100)
committer  Will Deacon <[email protected]>
           Thu, 8 Aug 2019 12:31:54 +0000 (13:31 +0100)
The SMMU command queue is a bottleneck in large systems, thanks to the
spin_lock which serialises accesses from all CPUs to the single queue
supported by the hardware.

Attempt to improve this situation by moving to a new algorithm for
inserting commands into the queue, which is lock-free on the fast-path.

Tested-by: Ganapatrao Kulkarni <[email protected]>
Signed-off-by: Will Deacon <[email protected]>
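
A minimal sketch may help before reading the diff itself. The code below is a hypothetical user-space model of the lock-free fast path described above, with every name invented; it is not the driver code. It packs prod and cons into a single 64-bit word, as the new arm_smmu_ll_queue union in the patch does, so that one cmpxchg claims a whole batch of command slots without a spinlock. Wrap/overflow flags, ownership hand-off and CMD_SYNC completion are all omitted.

/*
 * Hypothetical model of lock-free slot allocation (illustrative only).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define LLQ_SHIFT   8                           /* e.g. a 256-entry queue */
#define LLQ_NENTS   (1u << LLQ_SHIFT)

static _Atomic uint64_t llq_val;                /* cons:32 | prod:32 */

static uint32_t llq_prod(uint64_t v) { return (uint32_t)v; }
static uint32_t llq_cons(uint64_t v) { return (uint32_t)(v >> 32); }

static bool llq_has_space(uint64_t v, uint32_t n)
{
    /* free-running counters; unsigned arithmetic keeps this wrap-safe */
    return LLQ_NENTS - (llq_prod(v) - llq_cons(v)) >= n;
}

/* Claim n consecutive slots and return the prod value of the first one. */
static uint32_t llq_claim(uint32_t n)
{
    uint64_t old = atomic_load_explicit(&llq_val, memory_order_relaxed);

    for (;;) {
        uint64_t new;

        if (!llq_has_space(old, n)) {
            /* the real driver re-reads the hardware cons pointer here */
            old = atomic_load_explicit(&llq_val, memory_order_relaxed);
            continue;
        }

        new = ((uint64_t)llq_cons(old) << 32) | (llq_prod(old) + n);
        if (atomic_compare_exchange_weak_explicit(&llq_val, &old, new,
                                                  memory_order_relaxed,
                                                  memory_order_relaxed))
            return llq_prod(old);               /* our slots: [prod, prod + n) */
        /* 'old' now holds the current value; go round again */
    }
}

In the real patch the prod value also carries wrap and ownership flags: the CPU whose winning cmpxchg finds the OWNED flag clear becomes the queue owner, responsible for publishing everyone's commands to the hardware.
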
drivers/iommu/arm-smmu-v3.c

index 9ebb8b39a3b120d8f9095be45c975b7ab8465f5a..202b4b6fc70ab51c8b5708d5cb4961f6a13b378c 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
 
 #define Q_IDX(llq, p)                  ((p) & ((1 << (llq)->max_n_shift) - 1))
 #define Q_WRP(llq, p)                  ((p) & (1 << (llq)->max_n_shift))
-#define Q_OVERFLOW_FLAG                        (1 << 31)
+#define Q_OVERFLOW_FLAG                        (1U << 31)
 #define Q_OVF(p)                       ((p) & Q_OVERFLOW_FLAG)
 #define Q_ENT(q, p)                    ((q)->base +                    \
                                         Q_IDX(&((q)->llq), p) *        \
 #define CMDQ_ERR_CERROR_ABT_IDX                2
 #define CMDQ_ERR_CERROR_ATC_INV_IDX    3
 
+#define CMDQ_PROD_OWNED_FLAG           Q_OVERFLOW_FLAG
+
 #define CMDQ_0_OP                      GENMASK_ULL(7, 0)
 #define CMDQ_0_SSV                     (1UL << 11)
 
 #define PRIQ_1_ADDR_MASK               GENMASK_ULL(63, 12)
 
 /* High-level queue structures */
-#define ARM_SMMU_POLL_TIMEOUT_US       100
-#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US  1000000 /* 1s! */
-#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT  10
+#define ARM_SMMU_POLL_TIMEOUT_US       1000000 /* 1s! */
+#define ARM_SMMU_POLL_SPIN_COUNT       10
 
 #define MSI_IOVA_BASE                  0x8000000
 #define MSI_IOVA_LENGTH                        0x100000
@@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent {
 
                #define CMDQ_OP_CMD_SYNC        0x46
                struct {
-                       u32                     msidata;
                        u64                     msiaddr;
                } sync;
        };
 };
 
 struct arm_smmu_ll_queue {
-       u32                             prod;
-       u32                             cons;
+       union {
+               u64                     val;
+               struct {
+                       u32             prod;
+                       u32             cons;
+               };
+               struct {
+                       atomic_t        prod;
+                       atomic_t        cons;
+               } atomic;
+               u8                      __pad[SMP_CACHE_BYTES];
+       } ____cacheline_aligned_in_smp;
        u32                             max_n_shift;
 };
 
@@ -499,9 +509,18 @@ struct arm_smmu_queue {
        u32 __iomem                     *cons_reg;
 };
 
+struct arm_smmu_queue_poll {
+       ktime_t                         timeout;
+       unsigned int                    delay;
+       unsigned int                    spin_cnt;
+       bool                            wfe;
+};
+
 struct arm_smmu_cmdq {
        struct arm_smmu_queue           q;
-       spinlock_t                      lock;
+       atomic_long_t                   *valid_map;
+       atomic_t                        owner_prod;
+       atomic_t                        lock;
 };
 
 struct arm_smmu_evtq {
@@ -581,8 +600,6 @@ struct arm_smmu_device {
 
        int                             gerr_irq;
        int                             combined_irq;
-       u32                             sync_nr;
-       u8                              prev_cmd_opcode;
 
        unsigned long                   ias; /* IPA */
        unsigned long                   oas; /* PA */
@@ -601,12 +618,6 @@ struct arm_smmu_device {
 
        struct arm_smmu_strtab_cfg      strtab_cfg;
 
-       /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
-       union {
-               u32                     sync_count;
-               u64                     padding;
-       };
-
        /* IOMMU core code handle */
        struct iommu_device             iommu;
 };
@@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu)
 }
 
 /* Low-level queue manipulation functions */
+static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
+{
+       u32 space, prod, cons;
+
+       prod = Q_IDX(q, q->prod);
+       cons = Q_IDX(q, q->cons);
+
+       if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
+               space = (1 << q->max_n_shift) - (prod - cons);
+       else
+               space = cons - prod;
+
+       return space >= n;
+}
+
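
The wrap-bit arithmetic above is easy to check by hand. The following small, hypothetical user-space harness (not part of the patch) reimplements the calculation for a 16-entry queue:

/* Hypothetical harness for the space calculation (not kernel code). */
#include <stdio.h>
#include <stdint.h>

#define MAX_N_SHIFT 4                           /* 16-entry example queue */
#define Q_IDX(p)    ((p) & ((1u << MAX_N_SHIFT) - 1))
#define Q_WRP(p)    ((p) & (1u << MAX_N_SHIFT))

static uint32_t space(uint32_t prod, uint32_t cons)
{
    if (Q_WRP(prod) == Q_WRP(cons))
        return (1u << MAX_N_SHIFT) - (Q_IDX(prod) - Q_IDX(cons));
    return Q_IDX(cons) - Q_IDX(prod);
}

int main(void)
{
    printf("%u\n", space(0x03, 0x00));  /* same lap: 3 in flight, 13 free */
    printf("%u\n", space(0x12, 0x05));  /* prod wrapped, cons not: 5 - 2 = 3 free */
    return 0;
}

When prod and cons are on the same lap, the free space is the queue size minus the in-flight entries; once prod has wrapped past cons, only the gap up to cons remains.
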
 static bool queue_full(struct arm_smmu_ll_queue *q)
 {
        return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
@@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
               Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
 }
 
-static void queue_sync_cons_in(struct arm_smmu_queue *q)
+static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
 {
-       q->llq.cons = readl_relaxed(q->cons_reg);
+       return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
+               (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
+              ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
+               (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
 }
 
 static void queue_sync_cons_out(struct arm_smmu_queue *q)
@@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
        return ret;
 }
 
-static void queue_sync_prod_out(struct arm_smmu_queue *q)
+static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
 {
-       writel(q->llq.prod, q->prod_reg);
+       u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
+       return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
 }
 
-static void queue_inc_prod(struct arm_smmu_ll_queue *q)
+static void queue_poll_init(struct arm_smmu_device *smmu,
+                           struct arm_smmu_queue_poll *qp)
 {
-       u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
-       q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+       qp->delay = 1;
+       qp->spin_cnt = 0;
+       qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+       qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
 }
 
-/*
- * Wait for the SMMU to consume items. If sync is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+static int queue_poll(struct arm_smmu_queue_poll *qp)
 {
-       ktime_t timeout;
-       unsigned int delay = 1, spin_cnt = 0;
-
-       /* Wait longer if it's a CMD_SYNC */
-       timeout = ktime_add_us(ktime_get(), sync ?
-                                           ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
-                                           ARM_SMMU_POLL_TIMEOUT_US);
-
-       while (queue_sync_cons_in(q),
-             (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
-               if (ktime_compare(ktime_get(), timeout) > 0)
-                       return -ETIMEDOUT;
+       if (ktime_compare(ktime_get(), qp->timeout) > 0)
+               return -ETIMEDOUT;
 
-               if (wfe) {
-                       wfe();
-               } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
-                       cpu_relax();
-                       continue;
-               } else {
-                       udelay(delay);
-                       delay *= 2;
-                       spin_cnt = 0;
-               }
+       if (qp->wfe) {
+               wfe();
+       } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
+               cpu_relax();
+       } else {
+               udelay(qp->delay);
+               qp->delay *= 2;
+               qp->spin_cnt = 0;
        }
 
        return 0;
@@ -788,17 +805,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
                *dst++ = cpu_to_le64(*src++);
 }
 
-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
-{
-       if (queue_full(&q->llq))
-               return -ENOSPC;
-
-       queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
-       queue_inc_prod(&q->llq);
-       queue_sync_prod_out(q);
-       return 0;
-}
-
 static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
 {
        int i;
@@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
                cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
                break;
        case CMDQ_OP_CMD_SYNC:
-               if (ent->sync.msiaddr)
+               if (ent->sync.msiaddr) {
                        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ);
-               else
+                       cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
+               } else {
                        cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+               }
                cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
                cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
-               /*
-                * Commands are written little-endian, but we want the SMMU to
-                * receive MSIData, and thus write it back to memory, in CPU
-                * byte order, so big-endian needs an extra byteswap here.
-                */
-               cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
-                                    cpu_to_le32(ent->sync.msidata));
-               cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
                break;
        default:
                return -ENOENT;
@@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
        return 0;
 }
 
+static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
+                                        u32 prod)
+{
+       struct arm_smmu_queue *q = &smmu->cmdq.q;
+       struct arm_smmu_cmdq_ent ent = {
+               .opcode = CMDQ_OP_CMD_SYNC,
+       };
+
+       /*
+        * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
+        * payload, so the write will zero the entire command on that platform.
+        */
+       if (smmu->features & ARM_SMMU_FEAT_MSI &&
+           smmu->features & ARM_SMMU_FEAT_COHERENCY) {
+               ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
+                                  q->ent_dwords * 8;
+       }
+
+       arm_smmu_cmdq_build_cmd(cmd, &ent);
+}
+
 static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 {
        static const char *cerror_str[] = {
@@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
        queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
 }
 
-static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+/*
+ * Command queue locking.
+ * This is a form of bastardised rwlock with the following major changes:
+ *
+ * - The only LOCK routines are exclusive_trylock() and shared_lock().
+ *   Neither have barrier semantics, and instead provide only a control
+ *   dependency.
+ *
+ * - The UNLOCK routines are supplemented with shared_tryunlock(), which
+ *   fails if the caller appears to be the last lock holder (yes, this is
+ *   racy). All successful UNLOCK routines have RELEASE semantics.
+ */
+static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
 {
-       struct arm_smmu_queue *q = &smmu->cmdq.q;
-       bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+       int val;
 
-       smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
+       /*
+        * We can try to avoid the cmpxchg() loop by simply incrementing the
+        * lock counter. When held in exclusive state, the lock counter is set
+        * to INT_MIN so these increments won't hurt as the value will remain
+        * negative.
+        */
+       if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
+               return;
+
+       do {
+               val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
+       } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
+}
+
+static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
+{
+       (void)atomic_dec_return_release(&cmdq->lock);
+}
+
+static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
+{
+       if (atomic_read(&cmdq->lock) == 1)
+               return false;
+
+       arm_smmu_cmdq_shared_unlock(cmdq);
+       return true;
+}
+
+#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)           \
+({                                                                     \
+       bool __ret;                                                     \
+       local_irq_save(flags);                                          \
+       __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN);       \
+       if (!__ret)                                                     \
+               local_irq_restore(flags);                               \
+       __ret;                                                          \
+})
+
+#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)         \
+({                                                                     \
+       atomic_set_release(&cmdq->lock, 0);                             \
+       local_irq_restore(flags);                                       \
+})
+
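
The INT_MIN trick is easier to follow outside the kernel. Below is a hypothetical C11-atomics re-expression of the same discipline, with invented names, everything relaxed/release as in the comments above, and interrupt masking, WFE and timeouts left out. Readers normally pay a single increment; only while a writer holds the lock do they fall back to the cmpxchg loop.

/* Hypothetical stand-alone model of the shared/exclusive cmdq lock. */
#include <stdatomic.h>
#include <stdbool.h>
#include <limits.h>

static atomic_int lock;     /* 0: free, >0: reader count, negative: writer held */

static void shared_lock(void)
{
    int val;

    /* Fast path: one increment. If a writer holds the lock the value is
     * around INT_MIN, so the increment keeps it negative and is harmless. */
    if (atomic_fetch_add_explicit(&lock, 1, memory_order_relaxed) >= 0)
        return;

    do {
        while ((val = atomic_load_explicit(&lock, memory_order_relaxed)) < 0)
            ;   /* spin until the writer releases */
    } while (!atomic_compare_exchange_weak_explicit(&lock, &val, val + 1,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
}

static void shared_unlock(void)
{
    atomic_fetch_sub_explicit(&lock, 1, memory_order_release);
}

/* Fails (racily) if we appear to be the last reader. */
static bool shared_tryunlock(void)
{
    if (atomic_load_explicit(&lock, memory_order_relaxed) == 1)
        return false;
    shared_unlock();
    return true;
}

static bool exclusive_trylock(void)
{
    int expected = 0;

    /* Succeeds only when there is no reader and no writer. */
    return atomic_compare_exchange_strong_explicit(&lock, &expected, INT_MIN,
                                                   memory_order_relaxed,
                                                   memory_order_relaxed);
}

static void exclusive_unlock(void)
{
    /* Discards increments made by readers that arrived while we held the
     * lock; they retry via the cmpxchg loop in shared_lock(). */
    atomic_store_explicit(&lock, 0, memory_order_release);
}

In the patch, a failed exclusive trylock is how arm_smmu_cmdq_poll_until_not_full() decides to wait for somebody else to refresh cons, and a failed shared_tryunlock() tells the last CMD_SYNC waiter that it is safe to publish the cons value it just read.
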
+
+/*
+ * Command queue insertion.
+ * This is made fiddly by our attempts to achieve some sort of scalability
+ * since there is one queue shared amongst all of the CPUs in the system.  If
+ * you like mixed-size concurrency, dependency ordering and relaxed atomics,
+ * then you'll *love* this monstrosity.
+ *
+ * The basic idea is to split the queue up into ranges of commands that are
+ * owned by a given CPU; the owner may not have written all of the commands
+ * itself, but is responsible for advancing the hardware prod pointer when
+ * the time comes. The algorithm is roughly:
+ *
+ *     1. Allocate some space in the queue. At this point we also discover
+ *        whether the head of the queue is currently owned by another CPU,
+ *        or whether we are the owner.
+ *
+ *     2. Write our commands into our allocated slots in the queue.
+ *
+ *     3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
+ *
+ *     4. If we are an owner:
+ *             a. Wait for the previous owner to finish.
+ *             b. Mark the queue head as unowned, which tells us the range
+ *                that we are responsible for publishing.
+ *             c. Wait for all commands in our owned range to become valid.
+ *             d. Advance the hardware prod pointer.
+ *             e. Tell the next owner we've finished.
+ *
+ *     5. If we are inserting a CMD_SYNC (we may or may not have been an
+ *        owner), then we need to stick around until it has completed:
+ *             a. If we have MSIs, the SMMU can write back into the CMD_SYNC
+ *                to clear the first 4 bytes.
+ *             b. Otherwise, we spin waiting for the hardware cons pointer to
+ *                advance past our command.
+ *
+ * The devil is in the details, particularly the use of locking for handling
+ * SYNC completion and freeing up space in the queue before we think that it is
+ * full.
+ */
+static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
+                                              u32 sprod, u32 eprod, bool set)
+{
+       u32 swidx, sbidx, ewidx, ebidx;
+       struct arm_smmu_ll_queue llq = {
+               .max_n_shift    = cmdq->q.llq.max_n_shift,
+               .prod           = sprod,
+       };
 
-       while (queue_insert_raw(q, cmd) == -ENOSPC) {
-               if (queue_poll_cons(q, false, wfe))
-                       dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+       ewidx = BIT_WORD(Q_IDX(&llq, eprod));
+       ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
+
+       while (llq.prod != eprod) {
+               unsigned long mask;
+               atomic_long_t *ptr;
+               u32 limit = BITS_PER_LONG;
+
+               swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
+               sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
+
+               ptr = &cmdq->valid_map[swidx];
+
+               if ((swidx == ewidx) && (sbidx < ebidx))
+                       limit = ebidx;
+
+               mask = GENMASK(limit - 1, sbidx);
+
+               /*
+                * The valid bit is the inverse of the wrap bit. This means
+                * that a zero-initialised queue is invalid and, after marking
+                * all entries as valid, they become invalid again when we
+                * wrap.
+                */
+               if (set) {
+                       atomic_long_xor(mask, ptr);
+               } else { /* Poll */
+                       unsigned long valid;
+
+                       valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
+                       atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid);
+               }
+
+               llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
        }
 }
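
The "valid bit is the inverse of the wrap bit" trick above can be demonstrated in a few lines. This is a hypothetical stand-alone toy (an 8-entry queue whose map fits in one byte), not driver code:

/* Hypothetical demo of wrap-relative validity (illustrative only). */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SHIFT   3                           /* 8-entry queue */
#define IDX(p)  ((p) & ((1u << SHIFT) - 1))
#define WRP(p)  (((p) >> SHIFT) & 1u)

static uint8_t valid_map;                   /* zero-initialised, like bitmap_zalloc() */

static void mark(uint32_t prod)     { valid_map ^= 1u << IDX(prod); }
static bool is_valid(uint32_t prod) { return ((valid_map >> IDX(prod)) & 1u) == !WRP(prod); }

int main(void)
{
    uint32_t p;

    printf("lap 0, slot 2 before mark: %d\n", is_valid(2));      /* 0 */
    for (p = 0; p < 8; p++)
        mark(p);                            /* first lap: set every slot */
    printf("lap 0, slot 2 after mark:  %d\n", is_valid(2));      /* 1 */

    /* Second lap (wrap bit set): the same all-ones map reads as invalid
     * again, so stale entries are never mistaken for fresh ones. */
    printf("lap 1, slot 2 before mark: %d\n", is_valid(8 + 2));  /* 0 */
    mark(8 + 2);
    printf("lap 1, slot 2 after mark:  %d\n", is_valid(8 + 2));  /* 1 */
    return 0;
}

Because producers only ever toggle bits, the map never needs clearing between laps, and a poller on lap N automatically treats leftovers from lap N-1 as not yet valid.
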
 
-static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
-                                   struct arm_smmu_cmdq_ent *ent)
+/* Mark all entries in the range [sprod, eprod) as valid */
+static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
+                                       u32 sprod, u32 eprod)
+{
+       __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
+}
+
+/* Wait for all entries in the range [sprod, eprod) to become valid */
+static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
+                                        u32 sprod, u32 eprod)
+{
+       __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
+}
+
+/* Wait for the command queue to become non-full */
+static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+                                            struct arm_smmu_ll_queue *llq)
 {
-       u64 cmd[CMDQ_ENT_DWORDS];
        unsigned long flags;
+       struct arm_smmu_queue_poll qp;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       int ret = 0;
 
-       if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
-               dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
-                        ent->opcode);
-               return;
+       /*
+        * Try to update our copy of cons by grabbing exclusive cmdq access. If
+        * that fails, spin until somebody else updates it for us.
+        */
+       if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
+               WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
+               arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
+               llq->val = READ_ONCE(cmdq->q.llq.val);
+               return 0;
        }
 
-       spin_lock_irqsave(&smmu->cmdq.lock, flags);
-       arm_smmu_cmdq_insert_cmd(smmu, cmd);
-       spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+       queue_poll_init(smmu, &qp);
+       do {
+               llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+               if (!queue_full(llq))
+                       break;
+
+               ret = queue_poll(&qp);
+       } while (!ret);
+
+       return ret;
 }
 
 /*
- * The difference between val and sync_idx is bounded by the maximum size of
- * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
+ * Wait until the SMMU signals a CMD_SYNC completion MSI.
+ * Must be called with the cmdq lock held in some capacity.
  */
-static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+                                         struct arm_smmu_ll_queue *llq)
 {
-       ktime_t timeout;
-       u32 val;
+       int ret = 0;
+       struct arm_smmu_queue_poll qp;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
 
-       timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
-       val = smp_cond_load_acquire(&smmu->sync_count,
-                                   (int)(VAL - sync_idx) >= 0 ||
-                                   !ktime_before(ktime_get(), timeout));
+       queue_poll_init(smmu, &qp);
 
-       return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+       /*
+        * The MSI won't generate an event, since it's being written back
+        * into the command queue.
+        */
+       qp.wfe = false;
+       smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
+       llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
+       return ret;
 }
 
-static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
+/*
+ * Wait until the SMMU cons index passes llq->prod.
+ * Must be called with the cmdq lock held in some capacity.
+ */
+static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+                                              struct arm_smmu_ll_queue *llq)
 {
-       u64 cmd[CMDQ_ENT_DWORDS];
-       unsigned long flags;
-       struct arm_smmu_cmdq_ent ent = {
-               .opcode = CMDQ_OP_CMD_SYNC,
-               .sync   = {
-                       .msiaddr = virt_to_phys(&smmu->sync_count),
-               },
-       };
+       struct arm_smmu_queue_poll qp;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       u32 prod = llq->prod;
+       int ret = 0;
 
-       spin_lock_irqsave(&smmu->cmdq.lock, flags);
+       queue_poll_init(smmu, &qp);
+       llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+       do {
+               if (queue_consumed(llq, prod))
+                       break;
 
-       /* Piggy-back on the previous command if it's a SYNC */
-       if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
-               ent.sync.msidata = smmu->sync_nr;
-       } else {
-               ent.sync.msidata = ++smmu->sync_nr;
-               arm_smmu_cmdq_build_cmd(cmd, &ent);
-               arm_smmu_cmdq_insert_cmd(smmu, cmd);
-       }
+               ret = queue_poll(&qp);
 
-       spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+               /*
+                * This needs to be a readl() so that our subsequent call
+                * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
+                *
+                * Specifically, we need to ensure that we observe all
+                * shared_lock()s by other CMD_SYNCs that share our owner,
+                * so that a failing call to tryunlock() means that we're
+                * the last one out and therefore we can safely advance
+                * cmdq->q.llq.cons. Roughly speaking:
+                *
+                * CPU 0                CPU1                    CPU2 (us)
+                *
+                * if (sync)
+                *      shared_lock();
+                *
+                * dma_wmb();
+                * set_valid_map();
+                *
+                *                      if (owner) {
+                *                              poll_valid_map();
+                *                              <control dependency>
+                *                              writel(prod_reg);
+                *
+                *                                              readl(cons_reg);
+                *                                              tryunlock();
+                *
+                * Requires us to see CPU 0's shared_lock() acquisition.
+                */
+               llq->cons = readl(cmdq->q.cons_reg);
+       } while (!ret);
 
-       return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
+       return ret;
 }
 
-static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+                                        struct arm_smmu_ll_queue *llq)
 {
-       u64 cmd[CMDQ_ENT_DWORDS];
+       if (smmu->features & ARM_SMMU_FEAT_MSI &&
+           smmu->features & ARM_SMMU_FEAT_COHERENCY)
+               return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+
+       return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+}
+
+static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
+                                       u32 prod, int n)
+{
+       int i;
+       struct arm_smmu_ll_queue llq = {
+               .max_n_shift    = cmdq->q.llq.max_n_shift,
+               .prod           = prod,
+       };
+
+       for (i = 0; i < n; ++i) {
+               u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
+
+               prod = queue_inc_prod_n(&llq, i);
+               queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
+       }
+}
+
+static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+                                      u64 *cmds, int n, bool sync)
+{
+       u64 cmd_sync[CMDQ_ENT_DWORDS];
+       u32 prod;
        unsigned long flags;
-       bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
-       struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
-       int ret;
+       bool owner;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       struct arm_smmu_ll_queue llq = {
+               .max_n_shift = cmdq->q.llq.max_n_shift,
+       }, head = llq;
+       int ret = 0;
 
-       arm_smmu_cmdq_build_cmd(cmd, &ent);
+       /* 1. Allocate some space in the queue */
+       local_irq_save(flags);
+       llq.val = READ_ONCE(cmdq->q.llq.val);
+       do {
+               u64 old;
+
+               while (!queue_has_space(&llq, n + sync)) {
+                       local_irq_restore(flags);
+                       if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+                               dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
+                       local_irq_save(flags);
+               }
+
+               head.cons = llq.cons;
+               head.prod = queue_inc_prod_n(&llq, n + sync) |
+                                            CMDQ_PROD_OWNED_FLAG;
+
+               old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
+               if (old == llq.val)
+                       break;
+
+               llq.val = old;
+       } while (1);
+       owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
+       head.prod &= ~CMDQ_PROD_OWNED_FLAG;
+       llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+       /*
+        * 2. Write our commands into the queue
+        * Dependency ordering from the cmpxchg() loop above.
+        */
+       arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
+       if (sync) {
+               prod = queue_inc_prod_n(&llq, n);
+               arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+               queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
+
+               /*
+                * In order to determine completion of our CMD_SYNC, we must
+                * ensure that the queue can't wrap twice without us noticing.
+                * We achieve that by taking the cmdq lock as shared before
+                * marking our slot as valid.
+                */
+               arm_smmu_cmdq_shared_lock(cmdq);
+       }
 
-       spin_lock_irqsave(&smmu->cmdq.lock, flags);
-       arm_smmu_cmdq_insert_cmd(smmu, cmd);
-       ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
-       spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
+       /* 3. Mark our slots as valid, ensuring commands are visible first */
+       dma_wmb();
+       arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
 
+       /* 4. If we are the owner, take control of the SMMU hardware */
+       if (owner) {
+               /* a. Wait for previous owner to finish */
+               atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
+
+               /* b. Stop gathering work by clearing the owned flag */
+               prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
+                                                  &cmdq->q.llq.atomic.prod);
+               prod &= ~CMDQ_PROD_OWNED_FLAG;
+
+               /*
+                * c. Wait for any gathered work to be written to the queue.
+                * Note that we read our own entries so that we have the control
+                * dependency required by (d).
+                */
+               arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
+
+               /*
+                * d. Advance the hardware prod pointer
+                * Control dependency ordering from the entries becoming valid.
+                */
+               writel_relaxed(prod, cmdq->q.prod_reg);
+
+               /*
+                * e. Tell the next owner we're done
+                * Make sure we've updated the hardware first, so that we don't
+                * race to update prod and potentially move it backwards.
+                */
+               atomic_set_release(&cmdq->owner_prod, prod);
+       }
+
+       /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
+       if (sync) {
+               llq.prod = queue_inc_prod_n(&llq, n);
+               ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+               if (ret) {
+                       dev_err_ratelimited(smmu->dev,
+                                           "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
+                                           llq.prod,
+                                           readl_relaxed(cmdq->q.prod_reg),
+                                           readl_relaxed(cmdq->q.cons_reg));
+               }
+
+               /*
+                * Try to unlock the cmdq lock. This will fail if we're the last
+                * reader, in which case we can safely update cmdq->q.llq.cons
+                */
+               if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
+                       WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
+                       arm_smmu_cmdq_shared_unlock(cmdq);
+               }
+       }
+
+       local_irq_restore(flags);
        return ret;
 }
 
-static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
+                                  struct arm_smmu_cmdq_ent *ent)
 {
-       int ret;
-       bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
-                  (smmu->features & ARM_SMMU_FEAT_COHERENCY);
+       u64 cmd[CMDQ_ENT_DWORDS];
 
-       ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
-                 : __arm_smmu_cmdq_issue_sync(smmu);
-       if (ret)
-               dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
-       return ret;
+       if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
+               dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
+                        ent->opcode);
+               return -EINVAL;
+       }
+
+       return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+}
+
+static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
+{
+       return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
 }
 
 /* Context descriptor manipulation functions */
@@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie)
        /*
         * NOTE: when io-pgtable is in non-strict mode, we may get here with
         * PTEs previously cleared by unmaps on the current CPU not yet visible
-        * to the SMMU. We are relying on the DSB implicit in
-        * queue_sync_prod_out() to guarantee those are observed before the
-        * TLBI. Do be careful, 007.
+        * to the SMMU. We are relying on the dma_wmb() implicit during cmd
+        * insertion to guarantee those are observed before the TLBI. Do be
+        * careful, 007.
         */
        arm_smmu_cmdq_issue_cmd(smmu, &cmd);
        arm_smmu_cmdq_issue_sync(smmu);
@@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
        return 0;
 }
 
+static void arm_smmu_cmdq_free_bitmap(void *data)
+{
+       unsigned long *bitmap = data;
+       bitmap_free(bitmap);
+}
+
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+{
+       int ret = 0;
+       struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+       unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
+       atomic_long_t *bitmap;
+
+       atomic_set(&cmdq->owner_prod, 0);
+       atomic_set(&cmdq->lock, 0);
+
+       bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
+       if (!bitmap) {
+               dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
+               ret = -ENOMEM;
+       } else {
+               cmdq->valid_map = bitmap;
+               devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
+       }
+
+       return ret;
+}
+
 static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 {
        int ret;
 
        /* cmdq */
-       spin_lock_init(&smmu->cmdq.lock);
        ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
                                      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
                                      "cmdq");
        if (ret)
                return ret;
 
+       ret = arm_smmu_cmdq_init(smmu);
+       if (ret)
+               return ret;
+
        /* evtq */
        ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
                                      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
@@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
        /* Queue sizes, capped to ensure natural alignment */
        smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
                                             FIELD_GET(IDR1_CMDQS, reg));
-       if (!smmu->cmdq.q.llq.max_n_shift) {
-               /* Odd alignment restrictions on the base, so ignore for now */
-               dev_err(smmu->dev, "unit-length command queue not supported\n");
+       if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) {
+               /*
+                * The cmdq valid_map relies on the total number of entries
+                * being a multiple of BITS_PER_LONG. There's also no way
+                * we can handle the weird alignment restrictions on the
+                * base pointer for a unit-length queue.
+                */
+               dev_err(smmu->dev, "command queue size < %d entries not supported\n",
+                       BITS_PER_LONG);
                return -ENXIO;
        }
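
To put numbers on the new restriction (hypothetical values, not from the patch): with max_n_shift = 8 the command queue has 256 entries, so the valid_map allocated in arm_smmu_cmdq_init() spans exactly four longs on a 64-bit machine; anything smaller than 64 entries would leave a partially used word, which the xor/poll helpers above do not handle.

/* Hypothetical sizing illustration (assumes a 64-bit host). */
#include <stdio.h>

#define BITS_PER_LONG   64

int main(void)
{
    unsigned int max_n_shift = 8;               /* example IDR1.CMDQS value */
    unsigned int nents = 1u << max_n_shift;     /* 256 command-queue entries */

    if (nents < BITS_PER_LONG)                  /* i.e. max_n_shift < ilog2(64) */
        printf("too small: valid_map would need a partial word\n");
    else
        printf("valid_map spans %u longs\n", nents / BITS_PER_LONG);    /* 4 */
    return 0;
}
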
 