1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
3 * Copyright(c) 2015 - 2020 Intel Corporation.
6 #include <linux/topology.h>
7 #include <linux/cpumask.h>
8 #include <linux/interrupt.h>
9 #include <linux/numa.h>
/*
 * Global driver-wide NUMA affinity state. The list holds one
 * hfi1_affinity_node per NUMA node in use; both the list and the
 * per-node cpu_mask_sets are guarded by node_affinity.lock.
 */
16 struct hfi1_affinity_node_list node_affinity = {
17 .list = LIST_HEAD_INIT(node_affinity.list),
18 .lock = __MUTEX_INITIALIZER(node_affinity.lock)
21 /* Name of IRQ types, indexed by enum irq_type */
22 static const char * const irq_type_names[] = {
30 /* Per NUMA node count of HFI devices */
31 static unsigned int *hfi1_per_node_cntr;
/*
 * Reset a cpu_mask_set to its empty state: no CPUs available in 'mask'
 * and no CPUs accounted as 'used'. Callers repopulate 'mask' afterwards.
 */
33 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
35 cpumask_clear(&set->mask);
36 cpumask_clear(&set->used);
40 /* Increment generation of CPU set if needed */
41 static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
43 if (cpumask_equal(&set->mask, &set->used)) {
45 * We've used up all the CPUs, bump up the generation
46 * and reset the 'used' map
49 cpumask_clear(&set->used);
/*
 * Decrement generation of the CPU set: once the 'used' map drains back
 * to empty within a non-zero generation, restore the previous
 * generation's fully-used state so accounting stays balanced.
 */
53 static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
55 if (cpumask_empty(&set->used) && set->gen) {
57 cpumask_copy(&set->used, &set->mask);
61 /* Get the first CPU from the list of unused CPUs in a CPU set data structure */
62 static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
/* Rolls the generation over first if every CPU is already in use. */
69 _cpu_mask_set_gen_inc(set);
71 /* Find out CPUs left in CPU mask */
72 cpumask_andnot(diff, &set->mask, &set->used);
74 cpu = cpumask_first(diff);
75 if (cpu >= nr_cpu_ids) /* empty */
/* Claim the CPU so subsequent callers skip it this generation. */
78 cpumask_set_cpu(cpu, &set->used);
/*
 * Return a CPU previously handed out by cpu_mask_set_get_first() and
 * step the generation back down if the set drains empty.
 */
83 static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
88 cpumask_clear_cpu(cpu, &set->used);
89 _cpu_mask_set_gen_dec(set);
92 /* Initialize non-HT cpu cores mask */
93 void init_real_cpu_mask(void)
95 int possible, curr_cpu, i, ht;
97 cpumask_clear(&node_affinity.real_cpu_mask);
99 /* Start with cpu online mask as the real cpu mask */
100 cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
103 * Remove HT cores from the real cpu mask. Do this in two steps below.
/* ht = number of hardware threads per core (sibling count of cpu 0). */
105 possible = cpumask_weight(&node_affinity.real_cpu_mask);
106 ht = cpumask_weight(topology_sibling_cpumask(
107 cpumask_first(&node_affinity.real_cpu_mask)));
109 * Step 1. Skip over the first N HT siblings and use them as the
110 * "real" cores. Assumes that HT cores are not enumerated in
111 * succession (except in the single core case).
113 curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
114 for (i = 0; i < possible / ht; i++)
115 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
117 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
/* NOTE(review): relies on the Step 1 enumeration assumption above. */
120 for (; i < possible; i++) {
121 cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
122 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
/*
 * One-time module init of the global affinity state: seed the process
 * CPU set from the online mask, compute node/CPU counts, and count HFI
 * devices per NUMA node by walking the PCI device table.
 * Returns 0 on success, negative errno on allocation failure.
 */
126 int node_affinity_init(void)
129 struct pci_dev *dev = NULL;
130 const struct pci_device_id *ids = hfi1_pci_tbl;
132 cpumask_clear(&node_affinity.proc.used);
133 cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
135 node_affinity.proc.gen = 0;
136 node_affinity.num_core_siblings =
137 cpumask_weight(topology_sibling_cpumask(
138 cpumask_first(&node_affinity.proc.mask)
140 node_affinity.num_possible_nodes = num_possible_nodes();
141 node_affinity.num_online_nodes = num_online_nodes();
142 node_affinity.num_online_cpus = num_online_cpus();
145 * The real cpu mask is part of the affinity struct but it has to be
146 * initialized early. It is needed to calculate the number of user
147 * contexts in set_up_context_variables().
149 init_real_cpu_mask();
151 hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
152 sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
153 if (!hfi1_per_node_cntr)
/* Count devices per node; pci_get_device() iterates and drops refs. */
156 while (ids->vendor) {
158 while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
159 node = pcibus_to_node(dev->bus);
163 hfi1_per_node_cntr[node]++;
172 * Invalid PCI NUMA node information found, note it, and populate
/* Fallback: assume one device per node so later divisions stay sane. */
175 pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
176 pr_err("HFI: System BIOS may need to be upgraded\n");
177 for (node = 0; node < node_affinity.num_possible_nodes; node++)
178 hfi1_per_node_cntr[node] = 1;
/* Free one affinity-node entry, including its per-cpu counter array. */
185 static void node_affinity_destroy(struct hfi1_affinity_node *entry)
187 free_percpu(entry->comp_vect_affinity);
/*
 * Module teardown: destroy every node entry on the global list under
 * the lock, then release the per-node device counter array.
 */
191 void node_affinity_destroy_all(void)
193 struct list_head *pos, *q;
194 struct hfi1_affinity_node *entry;
196 mutex_lock(&node_affinity.lock);
197 list_for_each_safe(pos, q, &node_affinity.list) {
198 entry = list_entry(pos, struct hfi1_affinity_node,
201 node_affinity_destroy(entry);
203 mutex_unlock(&node_affinity.lock);
204 kfree(hfi1_per_node_cntr);
/*
 * Allocate and minimally initialize an affinity entry for 'node'.
 * NOTE(review): alloc_percpu() failure handling is not visible in this
 * fragment — confirm callers/remaining lines check comp_vect_affinity.
 */
207 static struct hfi1_affinity_node *node_affinity_allocate(int node)
209 struct hfi1_affinity_node *entry;
211 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
215 entry->comp_vect_affinity = alloc_percpu(u16);
216 INIT_LIST_HEAD(&entry->list);
222 * It appends an entry to the list.
223 * It *must* be called with node_affinity.lock held.
225 static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
227 list_add_tail(&entry->list, &node_affinity.list);
230 /* It must be called with node_affinity.lock held */
/* Linear search of the global list for the entry matching 'node'. */
231 static struct hfi1_affinity_node *node_affinity_lookup(int node)
233 struct list_head *pos;
234 struct hfi1_affinity_node *entry;
236 list_for_each(pos, &node_affinity.list) {
237 entry = list_entry(pos, struct hfi1_affinity_node, list);
238 if (entry->node == node)
/*
 * Pick the least-loaded CPU in 'possible_cpumask' according to the
 * per-cpu reference counts in 'comp_vect_affinity', bump its count,
 * and return it. Validates both inputs before use.
 */
245 static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
246 u16 __percpu *comp_vect_affinity)
253 if (!possible_cpumask) {
258 if (!comp_vect_affinity) {
263 ret_cpu = cpumask_first(possible_cpumask);
264 if (ret_cpu >= nr_cpu_ids) {
/* Seed with the first CPU's count, then scan for a smaller one. */
269 prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
270 for_each_cpu(curr_cpu, possible_cpumask) {
271 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
273 if (cntr < prev_cntr) {
279 *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;
/*
 * Inverse of per_cpu_affinity_get(): find the most-loaded CPU in the
 * mask, decrement its reference count, and return it.
 */
285 static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
286 u16 __percpu *comp_vect_affinity)
293 if (!possible_cpumask)
296 if (!comp_vect_affinity)
299 max_cpu = cpumask_first(possible_cpumask);
300 if (max_cpu >= nr_cpu_ids)
303 prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
304 for_each_cpu(curr_cpu, possible_cpumask) {
305 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
307 if (cntr > prev_cntr) {
313 *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;
319 * Non-interrupt CPUs are used first, then interrupt CPUs.
320 * Two already allocated cpu masks must be passed.
322 static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
323 struct hfi1_affinity_node *entry,
324 cpumask_var_t non_intr_cpus,
325 cpumask_var_t available_cpus)
326 __must_hold(&node_affinity.lock)
329 struct cpu_mask_set *set = dd->comp_vect;
331 lockdep_assert_held(&node_affinity.lock);
332 if (!non_intr_cpus) {
337 if (!available_cpus) {
342 /* Available CPUs for pinning completion vectors */
343 _cpu_mask_set_gen_inc(set);
344 cpumask_andnot(available_cpus, &set->mask, &set->used);
346 /* Available CPUs without SDMA engine interrupts */
347 cpumask_andnot(non_intr_cpus, available_cpus,
348 &entry->def_intr.used);
350 /* If there are non-interrupt CPUs available, use them first */
351 if (!cpumask_empty(non_intr_cpus))
352 cpu = cpumask_first(non_intr_cpus);
353 else /* Otherwise, use interrupt CPUs */
354 cpu = cpumask_first(available_cpus);
356 if (cpu >= nr_cpu_ids) { /* empty */
/* Account the chosen CPU in the device's completion-vector set. */
360 cpumask_set_cpu(cpu, &set->used);
/* Return a completion-vector CPU to the device's cpu_mask_set. */
366 static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
368 struct cpu_mask_set *set = dd->comp_vect;
373 cpu_mask_set_put(set, cpu);
376 /* _dev_comp_vect_mappings_destroy() is reentrant */
/*
 * Release every CPU in the completion-vector lookup table back to the
 * device's cpu_mask_set, then free the table. Safe to call twice: the
 * NULL check makes the second call a no-op.
 */
377 static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
381 if (!dd->comp_vect_mappings)
384 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
385 cpu = dd->comp_vect_mappings[i];
386 _dev_comp_vect_cpu_put(dd, cpu);
387 dd->comp_vect_mappings[i] = -1;
389 "[%s] Release CPU %d from completion vector %d",
390 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
393 kfree(dd->comp_vect_mappings);
394 dd->comp_vect_mappings = NULL;
398 * This function creates the table for looking up CPUs for completion vectors.
399 * num_comp_vectors needs to have been initilized before calling this function.
401 static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
402 struct hfi1_affinity_node *entry)
403 __must_hold(&node_affinity.lock)
406 cpumask_var_t non_intr_cpus;
407 cpumask_var_t available_cpus;
409 lockdep_assert_held(&node_affinity.lock);
/* Scratch masks handed down to _dev_comp_vect_cpu_get(). */
411 if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
414 if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
415 free_cpumask_var(non_intr_cpus);
419 dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
420 sizeof(*dd->comp_vect_mappings),
422 if (!dd->comp_vect_mappings) {
/* Pre-fill with -1 so a partial failure can be unwound safely. */
426 for (i = 0; i < dd->comp_vect_possible_cpus; i++)
427 dd->comp_vect_mappings[i] = -1;
429 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
430 cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
437 dd->comp_vect_mappings[i] = cpu;
439 "[%s] Completion Vector %d -> CPU %d",
440 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
443 free_cpumask_var(available_cpus);
444 free_cpumask_var(non_intr_cpus);
/* Error path: free scratch masks and undo any partial mapping. */
448 free_cpumask_var(available_cpus);
449 free_cpumask_var(non_intr_cpus);
450 _dev_comp_vect_mappings_destroy(dd);
/*
 * Public entry: build this device's completion-vector -> CPU table
 * under the global affinity lock. Returns 0 or a negative errno.
 */
455 int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
458 struct hfi1_affinity_node *entry;
460 mutex_lock(&node_affinity.lock);
461 entry = node_affinity_lookup(dd->node);
466 ret = _dev_comp_vect_mappings_create(dd, entry);
468 mutex_unlock(&node_affinity.lock);
/* Public teardown counterpart of hfi1_comp_vectors_set_up(). */
473 void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
475 _dev_comp_vect_mappings_destroy(dd);
/*
 * Translate a completion-vector index into its pinned CPU.
 * Returns the CPU number, with error returns on a missing table or an
 * out-of-range index (exact codes elided from this fragment).
 */
478 int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
480 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
481 struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
483 if (!dd->comp_vect_mappings)
485 if (comp_vect >= dd->comp_vect_possible_cpus)
488 return dd->comp_vect_mappings[comp_vect];
492 * It assumes dd->comp_vect_possible_cpus is available.
494 static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
495 struct hfi1_affinity_node *entry,
497 __must_hold(&node_affinity.lock)
500 int possible_cpus_comp_vect = 0;
501 struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;
503 lockdep_assert_held(&node_affinity.lock);
505 * If there's only one CPU available for completion vectors, then
506 * there will only be one completion vector available. Othewise,
507 * the number of completion vector available will be the number of
508 * available CPUs divide it by the number of devices in the
511 if (cpumask_weight(&entry->comp_vect_mask) == 1) {
512 possible_cpus_comp_vect = 1;
514 "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
/* Even share of the node's completion-vector CPUs per device. */
516 possible_cpus_comp_vect +=
517 cpumask_weight(&entry->comp_vect_mask) /
518 hfi1_per_node_cntr[dd->node];
521 * If the completion vector CPUs available doesn't divide
522 * evenly among devices, then the first device device to be
523 * initialized gets an extra CPU.
525 if (first_dev_init &&
526 cpumask_weight(&entry->comp_vect_mask) %
527 hfi1_per_node_cntr[dd->node] != 0)
528 possible_cpus_comp_vect++;
531 dd->comp_vect_possible_cpus = possible_cpus_comp_vect;
533 /* Reserving CPUs for device completion vector */
534 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
535 curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
536 entry->comp_vect_affinity);
540 cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
544 "[%s] Completion vector affinity CPU set(s) %*pbl",
545 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
546 cpumask_pr_args(dev_comp_vect_mask));
/* Error unwind: release the i CPUs already reserved above. */
551 for (j = 0; j < i; j++)
552 per_cpu_affinity_put_max(&entry->comp_vect_mask,
553 entry->comp_vect_affinity);
559 * It assumes dd->comp_vect_possible_cpus is available.
/*
 * Return every CPU reserved by _dev_comp_vect_cpu_mask_init() to the
 * node's per-cpu accounting and clear the device's comp-vect mask.
 */
561 static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
562 struct hfi1_affinity_node *entry)
563 __must_hold(&node_affinity.lock)
567 lockdep_assert_held(&node_affinity.lock);
568 if (!dd->comp_vect_possible_cpus)
571 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
572 cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
573 entry->comp_vect_affinity);
574 /* Clearing CPU in device completion vector cpu mask */
576 cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
579 dd->comp_vect_possible_cpus = 0;
583 * Interrupt affinity.
585 * non-rcv avail gets a default mask that
586 * starts as possible cpus with threads reset
587 * and each rcv avail reset.
589 * rcv avail gets node relative 1 wrapping back
590 * to the node relative 1 as necessary.
593 int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
595 struct hfi1_affinity_node *entry;
596 const struct cpumask *local_mask;
597 int curr_cpu, possible, i, ret;
598 bool new_entry = false;
/* Fall back to core 0's mask when the device's node has no CPUs. */
600 local_mask = cpumask_of_node(dd->node);
601 if (cpumask_first(local_mask) >= nr_cpu_ids)
602 local_mask = topology_core_cpumask(0);
604 mutex_lock(&node_affinity.lock);
605 entry = node_affinity_lookup(dd->node);
608 * If this is the first time this NUMA node's affinity is used,
609 * create an entry in the global affinity structure and initialize it.
612 entry = node_affinity_allocate(dd->node);
615 "Unable to allocate global affinity node\n");
621 init_cpu_mask_set(&entry->def_intr);
622 init_cpu_mask_set(&entry->rcv_intr);
623 cpumask_clear(&entry->comp_vect_mask);
624 cpumask_clear(&entry->general_intr_mask);
625 /* Use the "real" cpu mask of this node as the default */
626 cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
629 /* fill in the receive list */
630 possible = cpumask_weight(&entry->def_intr.mask);
631 curr_cpu = cpumask_first(&entry->def_intr.mask);
634 /* only one CPU, everyone will use it */
635 cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
636 cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
639 * The general/control context will be the first CPU in
640 * the default list, so it is removed from the default
641 * list and added to the general interrupt list.
643 cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
644 cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
645 curr_cpu = cpumask_next(curr_cpu,
646 &entry->def_intr.mask);
649 * Remove the remaining kernel receive queues from
650 * the default list and add them to the receive list.
653 i < (dd->n_krcv_queues - 1) *
654 hfi1_per_node_cntr[dd->node];
656 cpumask_clear_cpu(curr_cpu,
657 &entry->def_intr.mask);
658 cpumask_set_cpu(curr_cpu,
659 &entry->rcv_intr.mask);
660 curr_cpu = cpumask_next(curr_cpu,
661 &entry->def_intr.mask);
662 if (curr_cpu >= nr_cpu_ids)
667 * If there ends up being 0 CPU cores leftover for SDMA
668 * engines, use the same CPU cores as general/control
671 if (cpumask_empty(&entry->def_intr.mask))
672 cpumask_copy(&entry->def_intr.mask,
673 &entry->general_intr_mask);
676 /* Determine completion vector CPUs for the entire node */
677 cpumask_and(&entry->comp_vect_mask,
678 &node_affinity.real_cpu_mask, local_mask);
679 cpumask_andnot(&entry->comp_vect_mask,
680 &entry->comp_vect_mask,
681 &entry->rcv_intr.mask);
682 cpumask_andnot(&entry->comp_vect_mask,
683 &entry->comp_vect_mask,
684 &entry->general_intr_mask);
687 * If there ends up being 0 CPU cores leftover for completion
688 * vectors, use the same CPU core as the general/control
691 if (cpumask_empty(&entry->comp_vect_mask))
692 cpumask_copy(&entry->comp_vect_mask,
693 &entry->general_intr_mask);
/* new_entry doubles as "first device on this node" for the split. */
696 ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
701 node_affinity_add_tail(entry);
703 dd->affinity_entry = entry;
704 mutex_unlock(&node_affinity.lock);
/* Error path: only a freshly-allocated entry is destroyed here. */
710 node_affinity_destroy(entry);
711 mutex_unlock(&node_affinity.lock);
/*
 * Per-device affinity teardown: hand the device's completion-vector
 * CPUs back to the node entry and detach the device from it. The node
 * entry itself persists for future devices on the same node.
 */
715 void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
717 struct hfi1_affinity_node *entry;
719 mutex_lock(&node_affinity.lock);
720 if (!dd->affinity_entry)
722 entry = node_affinity_lookup(dd->node);
727 * Free device completion vector CPUs to be used by future
730 _dev_comp_vect_cpu_mask_clean_up(dd, entry);
732 dd->affinity_entry = NULL;
733 mutex_unlock(&node_affinity.lock);
737 * Function updates the irq affinity hint for msix after it has been changed
738 * by the user using the /proc/irq interface. This function only accepts
739 * one cpu in the mask.
741 static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
743 struct sdma_engine *sde = msix->arg;
744 struct hfi1_devdata *dd = sde->dd;
745 struct hfi1_affinity_node *entry;
746 struct cpu_mask_set *set;
/*
 * NOTE(review): 'cpu > num_online_cpus()' looks like a range sanity
 * check; with sparse CPU numbering a valid id can exceed the online
 * count — confirm against upstream intent before relying on it.
 */
749 if (cpu > num_online_cpus() || cpu == sde->cpu)
752 mutex_lock(&node_affinity.lock);
753 entry = node_affinity_lookup(dd->node);
/* Re-point the hint at the single user-selected CPU. */
759 cpumask_clear(&msix->mask);
760 cpumask_set_cpu(cpu, &msix->mask);
761 dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
762 msix->irq, irq_type_names[msix->type],
764 irq_set_affinity_hint(msix->irq, &msix->mask);
767 * Set the new cpu in the hfi1_affinity_node and clean
768 * the old cpu if it is not used by any other IRQ
770 set = &entry->def_intr;
771 cpumask_set_cpu(cpu, &set->mask);
772 cpumask_set_cpu(cpu, &set->used);
773 for (i = 0; i < dd->msix_info.max_requested; i++) {
774 struct hfi1_msix_entry *other_msix;
776 other_msix = &dd->msix_info.msix_entries[i];
777 if (other_msix->type != IRQ_SDMA || other_msix == msix)
/* Another SDMA IRQ still uses old_cpu -> keep it accounted. */
780 if (cpumask_test_cpu(old_cpu, &other_msix->mask))
783 cpumask_clear_cpu(old_cpu, &set->mask);
784 cpumask_clear_cpu(old_cpu, &set->used);
786 mutex_unlock(&node_affinity.lock);
/*
 * irq_affinity_notify callback: fires when userspace rewrites the IRQ
 * affinity; forwards the first CPU of the new mask to the SDMA updater.
 */
789 static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
790 const cpumask_t *mask)
792 int cpu = cpumask_first(mask);
793 struct hfi1_msix_entry *msix = container_of(notify,
794 struct hfi1_msix_entry,
797 /* Only one CPU configuration supported currently */
798 hfi1_update_sdma_affinity(msix, cpu);
/* kref release hook required by the notifier API; nothing to free. */
801 static void hfi1_irq_notifier_release(struct kref *ref)
804 * This is required by affinity notifier. We don't have anything to
/* Register the affinity notifier for an SDMA MSI-X vector. */
809 static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
811 struct irq_affinity_notify *notify = &msix->notify;
813 notify->irq = msix->irq;
814 notify->notify = hfi1_irq_notifier_notify;
815 notify->release = hfi1_irq_notifier_release;
817 if (irq_set_affinity_notifier(notify->irq, notify))
818 pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
/* Unregister the notifier (passing NULL detaches it). */
822 static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
824 struct irq_affinity_notify *notify = &msix->notify;
826 if (irq_set_affinity_notifier(notify->irq, NULL))
827 pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
832 * Function sets the irq affinity for msix.
833 * It *must* be called with node_affinity.lock held.
835 static int get_irq_affinity(struct hfi1_devdata *dd,
836 struct hfi1_msix_entry *msix)
839 struct hfi1_affinity_node *entry;
840 struct cpu_mask_set *set = NULL;
841 struct sdma_engine *sde = NULL;
842 struct hfi1_ctxtdata *rcd = NULL;
847 cpumask_clear(&msix->mask);
849 entry = node_affinity_lookup(dd->node);
/* Pick the CPU source by IRQ type: SDMA/rcvctxt use a cpu_mask_set,
 * general/control contexts get a fixed CPU. */
851 switch (msix->type) {
853 sde = (struct sdma_engine *)msix->arg;
854 scnprintf(extra, 64, "engine %u", sde->this_idx);
855 set = &entry->def_intr;
858 cpu = cpumask_first(&entry->general_intr_mask);
861 rcd = (struct hfi1_ctxtdata *)msix->arg;
862 if (rcd->ctxt == HFI1_CTRL_CTXT)
863 cpu = cpumask_first(&entry->general_intr_mask);
865 set = &entry->rcv_intr;
866 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
869 rcd = (struct hfi1_ctxtdata *)msix->arg;
870 set = &entry->def_intr;
871 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
874 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
879 * The general and control contexts are placed on a particular
880 * CPU, which is set above. Skip accounting for it. Everything else
881 * finds its CPU here.
883 if (cpu == -1 && set) {
884 if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
887 cpu = cpu_mask_set_get_first(set, diff);
889 free_cpumask_var(diff);
890 dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
894 free_cpumask_var(diff);
897 cpumask_set_cpu(cpu, &msix->mask);
898 dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
899 msix->irq, irq_type_names[msix->type],
901 irq_set_affinity_hint(msix->irq, &msix->mask);
/* SDMA vectors additionally track user affinity changes. */
903 if (msix->type == IRQ_SDMA) {
905 hfi1_setup_sdma_notifier(msix);
/* Locked wrapper around get_irq_affinity(). */
911 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
915 mutex_lock(&node_affinity.lock);
916 ret = get_irq_affinity(dd, msix);
917 mutex_unlock(&node_affinity.lock);
/*
 * Undo hfi1_get_irq_affinity(): release the vector's CPU accounting
 * (per IRQ type), drop the kernel affinity hint, and clear the mask.
 */
921 void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
922 struct hfi1_msix_entry *msix)
924 struct cpu_mask_set *set = NULL;
925 struct hfi1_affinity_node *entry;
927 mutex_lock(&node_affinity.lock);
928 entry = node_affinity_lookup(dd->node);
930 switch (msix->type) {
932 set = &entry->def_intr;
933 hfi1_cleanup_sdma_notifier(msix);
936 /* Don't do accounting for general contexts */
939 struct hfi1_ctxtdata *rcd = msix->arg;
941 /* Don't do accounting for control contexts */
942 if (rcd->ctxt != HFI1_CTRL_CTXT)
943 set = &entry->rcv_intr;
947 set = &entry->def_intr;
950 mutex_unlock(&node_affinity.lock);
/* Release every CPU this vector had claimed from the set. */
955 cpumask_andnot(&set->used, &set->used, &msix->mask);
956 _cpu_mask_set_gen_dec(set);
959 irq_set_affinity_hint(msix->irq, NULL);
960 cpumask_clear(&msix->mask);
961 mutex_unlock(&node_affinity.lock);
964 /* This should be called with node_affinity.lock held */
/*
 * Build the mask of CPUs corresponding to HW thread 'hw_thread_no'
 * across all physical cores: keep one CPU per core, then shift the
 * mask to select the requested sibling set.
 * NOTE(review): assumes sibling CPU ids are offset by a fixed stride —
 * confirm this matches the enumeration assumption in init_real_cpu_mask().
 */
965 static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
966 struct hfi1_affinity_node_list *affinity)
968 int possible, curr_cpu, i;
969 uint num_cores_per_socket = node_affinity.num_online_cpus /
970 affinity->num_core_siblings /
971 node_affinity.num_online_nodes;
973 cpumask_copy(hw_thread_mask, &affinity->proc.mask);
974 if (affinity->num_core_siblings > 0) {
975 /* Removing other siblings not needed for now */
976 possible = cpumask_weight(hw_thread_mask);
977 curr_cpu = cpumask_first(hw_thread_mask);
979 i < num_cores_per_socket * node_affinity.num_online_nodes;
981 curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
983 for (; i < possible; i++) {
984 cpumask_clear_cpu(curr_cpu, hw_thread_mask);
985 curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
988 /* Identifying correct HW threads within physical cores */
989 cpumask_shift_left(hw_thread_mask, hw_thread_mask,
990 num_cores_per_socket *
991 node_affinity.num_online_nodes *
/*
 * Recommend a CPU for a user process opening a context on the device
 * attached to NUMA 'node'. Returns the chosen CPU id, or -1 when no
 * recommendation is made. The chosen CPU is marked used in the global
 * process cpu_mask_set; callers release it via hfi1_put_proc_affinity().
 */
996 int hfi1_get_proc_affinity(int node)
998 int cpu = -1, ret, i;
999 struct hfi1_affinity_node *entry;
1000 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
1001 const struct cpumask *node_mask,
1002 *proc_mask = current->cpus_ptr;
1003 struct hfi1_affinity_node_list *affinity = &node_affinity;
1004 struct cpu_mask_set *set = &affinity->proc;
1007 * check whether process/context affinity has already
/* User already pinned to one CPU: honor it, just record usage. */
1010 if (current->nr_cpus_allowed == 1) {
1011 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
1012 current->pid, current->comm,
1013 cpumask_pr_args(proc_mask));
1015 * Mark the pre-set CPU as used. This is atomic so we don't
1018 cpu = cpumask_first(proc_mask);
1019 cpumask_set_cpu(cpu, &set->used);
/* User pinned to a subset of CPUs: don't override the choice. */
1021 } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
1022 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
1023 current->pid, current->comm,
1024 cpumask_pr_args(proc_mask));
1029 * The process does not have a preset CPU affinity so find one to
1030 * recommend using the following algorithm:
1032 * For each user process that is opening a context on HFI Y:
1033 * a) If all cores are filled, reinitialize the bitmask
1034 * b) Fill real cores first, then HT cores (First set of HT
1035 * cores on all physical cores, then second set of HT core,
1036 * and, so on) in the following order:
1038 * 1. Same NUMA node as HFI Y and not running an IRQ
1040 * 2. Same NUMA node as HFI Y and running an IRQ handler
1041 * 3. Different NUMA node to HFI Y and not running an IRQ
1043 * 4. Different NUMA node to HFI Y and running an IRQ
1045 * c) Mark core as filled in the bitmask. As user processes are
1046 * done, clear cores from the bitmask.
/* Scratch cpumasks; freed in reverse order on all exit paths. */
1049 ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
1052 ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
1055 ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
1057 goto free_hw_thread_mask;
1058 ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
1060 goto free_available_mask;
1062 mutex_lock(&affinity->lock);
1064 * If we've used all available HW threads, clear the mask and start
1067 _cpu_mask_set_gen_inc(set);
1070 * If NUMA node has CPUs used by interrupt handlers, include them in the
1071 * interrupt handler mask.
1073 entry = node_affinity_lookup(node);
/* A non-zero generation means the whole mask cycled; use 'mask',
 * otherwise only the CPUs actually marked used. */
1075 cpumask_copy(intrs_mask, (entry->def_intr.gen ?
1076 &entry->def_intr.mask :
1077 &entry->def_intr.used));
1078 cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
1079 &entry->rcv_intr.mask :
1080 &entry->rcv_intr.used));
1081 cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
1083 hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
1084 cpumask_pr_args(intrs_mask));
1086 cpumask_copy(hw_thread_mask, &set->mask);
1089 * If HT cores are enabled, identify which HW threads within the
1090 * physical cores should be used.
1092 if (affinity->num_core_siblings > 0) {
1093 for (i = 0; i < affinity->num_core_siblings; i++) {
1094 find_hw_thread_mask(i, hw_thread_mask, affinity);
1097 * If there's at least one available core for this HW
1098 * thread number, stop looking for a core.
1100 * diff will always be not empty at least once in this
1101 * loop as the used mask gets reset when
1102 * (set->mask == set->used) before this loop.
1104 cpumask_andnot(diff, hw_thread_mask, &set->used);
1105 if (!cpumask_empty(diff))
1109 hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
1110 cpumask_pr_args(hw_thread_mask));
1112 node_mask = cpumask_of_node(node);
1113 hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
1114 cpumask_pr_args(node_mask));
1116 /* Get cpumask of available CPUs on preferred NUMA */
1117 cpumask_and(available_mask, hw_thread_mask, node_mask);
1118 cpumask_andnot(available_mask, available_mask, &set->used);
1119 hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
1120 cpumask_pr_args(available_mask));
1123 * At first, we don't want to place processes on the same
1124 * CPUs as interrupt handlers. Then, CPUs running interrupt
1125 * handlers are used.
1127 * 1) If diff is not empty, then there are CPUs not running
1128 * non-interrupt handlers available, so diff gets copied
1129 * over to available_mask.
1130 * 2) If diff is empty, then all CPUs not running interrupt
1131 * handlers are taken, so available_mask contains all
1132 * available CPUs running interrupt handlers.
1133 * 3) If available_mask is empty, then all CPUs on the
1134 * preferred NUMA node are taken, so other NUMA nodes are
1135 * used for process assignments using the same method as
1136 * the preferred NUMA node.
1138 cpumask_andnot(diff, available_mask, intrs_mask);
1139 if (!cpumask_empty(diff))
1140 cpumask_copy(available_mask, diff);
1142 /* If we don't have CPUs on the preferred node, use other NUMA nodes */
1143 if (cpumask_empty(available_mask)) {
1144 cpumask_andnot(available_mask, hw_thread_mask, &set->used);
1145 /* Excluding preferred NUMA cores */
1146 cpumask_andnot(available_mask, available_mask, node_mask);
1148 "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
1149 cpumask_pr_args(available_mask));
1152 * At first, we don't want to place processes on the same
1153 * CPUs as interrupt handlers.
1155 cpumask_andnot(diff, available_mask, intrs_mask);
1156 if (!cpumask_empty(diff))
1157 cpumask_copy(available_mask, diff);
1159 hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
1160 cpumask_pr_args(available_mask));
1162 cpu = cpumask_first(available_mask);
1163 if (cpu >= nr_cpu_ids) /* empty */
/* Record the recommendation so later callers avoid this CPU. */
1166 cpumask_set_cpu(cpu, &set->used);
1168 mutex_unlock(&affinity->lock);
1169 hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
1171 free_cpumask_var(intrs_mask);
1172 free_available_mask:
1173 free_cpumask_var(available_mask);
1174 free_hw_thread_mask:
1175 free_cpumask_var(hw_thread_mask);
1177 free_cpumask_var(diff);
/*
 * Release a CPU previously handed out by hfi1_get_proc_affinity(),
 * making it available for future process placement.
 */
1182 void hfi1_put_proc_affinity(int cpu)
1184 struct hfi1_affinity_node_list *affinity = &node_affinity;
1185 struct cpu_mask_set *set = &affinity->proc;
1190 mutex_lock(&affinity->lock);
1191 cpu_mask_set_put(set, cpu);
1192 hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
1193 mutex_unlock(&affinity->lock);