1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. */
5 #include <linux/delay.h>
8 #include <linux/slab.h>
9 #include <linux/stop_machine.h>
14 * Note all code and data in this file is protected by
15 * ifs_sem. On HT systems all threads on a core will
16 * execute together, but only the first thread on the
17 * core will update results of the test.
20 #define CREATE_TRACE_POINTS
21 #include <trace/events/intel_ifs.h>
/*
 * Retry budget used by ifs_test_core(): its retry counter is loaded with this
 * value and decremented each time a scan pass reports the same chunk it was
 * started from. When the counter reaches zero and the hardware reported no
 * error, the status is marked IFS_SW_PARTIAL_COMPLETION.
 */
23 /* Max retries on the same chunk */
24 #define MAX_IFS_RETRIES 5
/*
 * Members of the argument bundle that ifs_test_core() hands to doscan()
 * through stop_core_cpuslocked(): ifsd and activate are filled in by
 * ifs_test_core(); status is written by doscan() with the value it read
 * from MSR_SCAN_STATUS.
 * NOTE(review): the enclosing struct header is not visible in this listing;
 * the name run_params is taken from the declarations that use it.
 */
27 struct ifs_data *ifsd;
28 union ifs_scan *activate;
29 union ifs_status status;
33 * Number of TSC cycles that a logical CPU will wait for the other
34 * logical CPU on the core in the WRMSR(ACTIVATE_SCAN).
/* Stored in activate.delay by ifs_test_core() when it builds the scan request. */
36 #define IFS_THREAD_WAIT 100000
/*
 * Values carried in the error_code field of union ifs_status.
 * can_restart() switches on them, and they are the designated indices of
 * the scan_test_status[] message table.
 * NOTE(review): IFS_NO_ERROR is referenced elsewhere in this file but its
 * enumerator is not visible in this listing -- presumably it is 0; confirm.
 */
38 enum ifs_status_err_code {
40 IFS_OTHER_THREAD_COULD_NOT_JOIN = 1,
41 IFS_INTERRUPTED_BEFORE_RENDEZVOUS = 2,
42 IFS_POWER_MGMT_INADEQUATE_FOR_SCAN = 3,
43 IFS_INVALID_CHUNK_RANGE = 4,
44 IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS = 5,
45 IFS_CORE_NOT_CAPABLE_CURRENTLY = 6,
46 IFS_UNASSIGNED_ERROR_CODE = 7,
47 IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT = 8,
48 IFS_INTERRUPTED_DURING_EXECUTION = 9,
49 IFS_UNASSIGNED_ERROR_CODE_0xA = 0xA,
50 IFS_CORRUPTED_CHUNK = 0xB,
/*
 * Human-readable text for each ifs_status_err_code, indexed by the code via
 * designated initializers. message_not_tested() prints the matching entry
 * when status.error_code is below ARRAY_SIZE(scan_test_status).
 */
53 static const char * const scan_test_status[] = {
54 [IFS_NO_ERROR] = "SCAN no error",
55 [IFS_OTHER_THREAD_COULD_NOT_JOIN] = "Other thread could not join.",
56 [IFS_INTERRUPTED_BEFORE_RENDEZVOUS] = "Interrupt occurred prior to SCAN coordination.",
57 [IFS_POWER_MGMT_INADEQUATE_FOR_SCAN] =
58 "Core Abort SCAN Response due to power management condition.",
59 [IFS_INVALID_CHUNK_RANGE] = "Non valid chunks in the range",
60 [IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS] = "Mismatch in arguments between threads T0/T1.",
61 [IFS_CORE_NOT_CAPABLE_CURRENTLY] = "Core not capable of performing SCAN currently",
62 [IFS_UNASSIGNED_ERROR_CODE] = "Unassigned error code 0x7",
63 [IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT] =
64 "Exceeded number of Logical Processors (LP) allowed to run Scan-At-Field concurrently",
65 [IFS_INTERRUPTED_DURING_EXECUTION] = "Interrupt occurred prior to SCAN start",
66 [IFS_UNASSIGNED_ERROR_CODE_0xA] = "Unassigned error code 0xA",
67 [IFS_CORRUPTED_CHUNK] = "Scan operation aborted due to corrupted image. Try reloading",
/*
 * Log why a scan did not yield a pass/fail verdict for the core of @cpu.
 * @dev:    IFS device; used for logging and to look up struct ifs_data
 * @cpu:    logical CPU whose SMT sibling mask is printed in every message
 * @status: scan status value to decode
 *
 * A set status.control_error is reported with dev_warn(), including the
 * current batch and the loaded image version. status.error_code is decoded
 * with dev_info(): codes that index into scan_test_status[] print the table
 * text, IFS_SW_TIMEOUT and IFS_SW_PARTIAL_COMPLETION have dedicated
 * messages, and there is a message that dumps the raw status.data for an
 * unknown status.
 *
 * NOTE(review): this listing does not show whether the control_error path
 * returns early, nor the branch keyword in front of the "unknown status"
 * message -- confirm against the full source.
 */
70 static void message_not_tested(struct device *dev, int cpu, union ifs_status status)
72 struct ifs_data *ifsd = ifs_get_data(dev);
75 * control_error is set when the microcode runs into a problem
76 * loading the image from the reserved BIOS memory, or it has
77 * been corrupted. Reloading the image may fix this issue.
79 if (status.control_error) {
80 dev_warn(dev, "CPU(s) %*pbl: Scan controller error. Batch: %02x version: 0x%x\n",
81 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
/* Codes that have an entry in the message table */
85 if (status.error_code < ARRAY_SIZE(scan_test_status)) {
86 dev_info(dev, "CPU(s) %*pbl: SCAN operation did not start. %s\n",
87 cpumask_pr_args(cpu_smt_mask(cpu)),
88 scan_test_status[status.error_code]);
89 } else if (status.error_code == IFS_SW_TIMEOUT) {
90 dev_info(dev, "CPU(s) %*pbl: software timeout during scan\n",
91 cpumask_pr_args(cpu_smt_mask(cpu)));
92 } else if (status.error_code == IFS_SW_PARTIAL_COMPLETION) {
93 dev_info(dev, "CPU(s) %*pbl: %s\n",
94 cpumask_pr_args(cpu_smt_mask(cpu)),
95 "Not all scan chunks were executed. Maximum forward progress retries exceeded");
97 dev_info(dev, "CPU(s) %*pbl: SCAN unknown status %llx\n",
98 cpumask_pr_args(cpu_smt_mask(cpu)), status.data);
/*
 * Log a scan failure for the core of @cpu.
 * @dev:    IFS device; used for logging and to look up struct ifs_data
 * @cpu:    logical CPU whose SMT sibling mask is printed
 * @status: scan status value to inspect
 *
 * When status.signature_error is set, a dev_err() message names the SMT
 * sibling CPUs, the current batch and the loaded image version.
 */
102 static void message_fail(struct device *dev, int cpu, union ifs_status status)
104 struct ifs_data *ifsd = ifs_get_data(dev);
107 * signature_error is set when the output from the scan chains does not
108 * match the expected signature. This might be a transient problem (e.g.
109 * due to a bit flip from an alpha particle or neutron). If the problem
110 * repeats on a subsequent test, then it indicates an actual problem in
111 * the core being tested.
113 if (status.signature_error) {
114 dev_err(dev, "CPU(s) %*pbl: test signature incorrect. Batch: %02x version: 0x%x\n",
115 cpumask_pr_args(cpu_smt_mask(cpu)), ifsd->cur_batch, ifsd->loaded_version);
/*
 * Decide from @status whether another scan attempt is worthwhile.
 * @status: scan status value returned by the last scan pass
 *
 * signature_error and control_error are tested first; the rest of the
 * decision is keyed on status.error_code, whose values are split into two
 * groups of case labels. ifs_test_core() tests the negated result after
 * every scan pass.
 *
 * NOTE(review): the switch header and the return statements are not visible
 * in this listing, so which outcome each group maps to cannot be confirmed
 * from here.
 */
119 static bool can_restart(union ifs_status status)
121 enum ifs_status_err_code err_code = status.error_code;
123 /* Signature for chunk is bad, or scan test failed */
124 if (status.signature_error || status.control_error)
129 case IFS_OTHER_THREAD_COULD_NOT_JOIN:
130 case IFS_INTERRUPTED_BEFORE_RENDEZVOUS:
131 case IFS_POWER_MGMT_INADEQUATE_FOR_SCAN:
132 case IFS_EXCEED_NUMBER_OF_THREADS_CONCURRENT:
133 case IFS_INTERRUPTED_DURING_EXECUTION:
135 case IFS_INVALID_CHUNK_RANGE:
136 case IFS_MISMATCH_ARGUMENTS_BETWEEN_THREADS:
137 case IFS_CORE_NOT_CAPABLE_CURRENTLY:
138 case IFS_UNASSIGNED_ERROR_CODE:
139 case IFS_UNASSIGNED_ERROR_CODE_0xA:
140 case IFS_CORRUPTED_CHUNK:
/*
 * SPINUNIT is the granularity that wait_for_sibling_cpu() checks its
 * remaining timeout against. array_cpus_in and scan_cpus_in are the
 * rendezvous counters passed to wait_for_sibling_cpu() by do_array_test()
 * and doscan() respectively; each is reset to 0 right before the matching
 * stop_core_cpuslocked() call.
 */
146 #define SPINUNIT 100 /* 100 nsec */
147 static atomic_t array_cpus_in;
148 static atomic_t scan_cpus_in;
151 * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
/*
 * @t:       counter compared against the number of CPUs in the calling
 *           CPU's SMT sibling mask
 * @timeout: wait budget, checked against SPINUNIT in the loop; callers in
 *           this file pass NSEC_PER_SEC
 *
 * Loops while *@t is below the sibling count, touching the NMI watchdog
 * inside the loop.
 *
 * NOTE(review): the statements that increment @t, delay, and consume
 * @timeout are not visible in this listing -- confirm against the full
 * source.
 */
153 static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
155 int cpu = smp_processor_id();
156 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
157 int all_cpus = cpumask_weight(smt_mask);
160 while (atomic_read(t) < all_cpus) {
161 if (timeout < SPINUNIT)
165 touch_nmi_watchdog();
170 * Execute the scan. Called "simultaneously" on all threads of a core
171 * at high priority using the stop_cpus mechanism.
/*
 * @data: pointer to the struct run_params prepared by ifs_test_core()
 *
 * Reads the start/stop chunk numbers out of the activation word (gen2 or
 * gen0 field layout, selected on ifsd->generation), rendezvouses with the
 * sibling threads through scan_cpus_in, writes the activation word to
 * MSR_ACTIVATE_SCAN, reads MSR_SCAN_STATUS back, emits the ifs_status trace
 * event and stores the status in params->status.
 *
 * NOTE(review): the assignment of ifsd, the declaration of first, the
 * condition guarding the params->status write-back and the return value are
 * not visible in this listing.
 */
173 static int doscan(void *data)
175 int cpu = smp_processor_id(), start, stop;
176 struct run_params *params = data;
177 union ifs_status status;
178 struct ifs_data *ifsd;
183 if (ifsd->generation) {
184 start = params->activate->gen2.start;
185 stop = params->activate->gen2.stop;
187 start = params->activate->gen0.start;
188 stop = params->activate->gen0.stop;
191 /* Only the first logical CPU on a core reports result */
192 first = cpumask_first(cpu_smt_mask(cpu));
/* Allow the sibling thread(s) up to one second to arrive */
194 wait_for_sibling_cpu(&scan_cpus_in, NSEC_PER_SEC);
197 * This WRMSR will wait for other HT threads to also write
198 * to this MSR (at most for activate.delay cycles). Then it
199 * starts scan of each requested chunk. The core scan happens
200 * during the "execution" of the WRMSR. This instruction can
201 * take up to 200 milliseconds (in the case where all chunks
202 * are processed in a single pass) before it retires.
204 wrmsrl(MSR_ACTIVATE_SCAN, params->activate->data);
205 rdmsrl(MSR_SCAN_STATUS, status.data);
207 trace_ifs_status(ifsd->cur_batch, start, stop, status.data);
209 /* Pass back the result of the scan */
211 params->status = status;
217 * Use stop_core_cpuslocked() to synchronize writing to MSR_ACTIVATE_SCAN
218 * on all threads of the core to be tested. Loop if necessary to complete
219 * run of all chunks. Include some defensive tests to make sure forward
220 * progress is made, and that the whole test completes in a reasonable time.
/*
 * @cpu: logical CPU on the core to be scanned; passed to
 *       stop_core_cpuslocked() and used in log messages
 * @dev: IFS device; used to look up struct ifs_data and for logging
 *
 * Builds the activation word (rsvd cleared, delay = IFS_THREAD_WAIT, chunk
 * range ending at ifsd->valid_chunks - 1, written in the gen2 or gen0 field
 * layout), then loops while to_start <= to_stop. Each pass resets
 * scan_cpus_in, runs doscan() through stop_core_cpuslocked() and picks up
 * the status doscan() left in params. The loop carries a deadline of HZ / 2
 * jiffies and a retry counter for passes whose reported chunk equals the
 * chunk they started from. Afterwards the raw status is stored in
 * ifsd->scan_details and ifsd->status becomes SCAN_TEST_FAIL,
 * SCAN_NOT_TESTED or SCAN_TEST_PASS.
 *
 * NOTE(review): several short lines (loop exits, else keywords, some local
 * declarations and the initial to_start assignment) are not visible in this
 * listing; the description covers only what is shown.
 */
222 static void ifs_test_core(int cpu, struct device *dev)
224 union ifs_scan activate;
225 union ifs_status status;
226 unsigned long timeout;
227 struct ifs_data *ifsd;
228 int to_start, to_stop;
230 struct run_params params;
233 ifsd = ifs_get_data(dev);
235 activate.gen0.rsvd = 0;
236 activate.delay = IFS_THREAD_WAIT;
239 to_stop = ifsd->valid_chunks - 1;
241 params.ifsd = ifs_get_data(dev);
243 if (ifsd->generation) {
244 activate.gen2.start = to_start;
245 activate.gen2.stop = to_stop;
247 activate.gen0.start = to_start;
248 activate.gen0.stop = to_stop;
/* Deadline for the whole test: half a second from now */
251 timeout = jiffies + HZ / 2;
252 retries = MAX_IFS_RETRIES;
254 while (to_start <= to_stop) {
255 if (time_after(jiffies, timeout)) {
256 status.error_code = IFS_SW_TIMEOUT;
260 params.activate = &activate;
261 atomic_set(&scan_cpus_in, 0);
/* NOTE(review): the third argument below looks like a mis-encoded address-of params -- confirm against the real source */
262 stop_core_cpuslocked(cpu, doscan, ¶ms);
264 status = params.status;
266 /* Some cases can be retried, give up for others */
267 if (!can_restart(status))
/* Chunk number reported by the hardware, read in the layout matching the generation */
270 status_chunk = ifsd->generation ? status.gen2.chunk_num : status.gen0.chunk_num;
271 if (status_chunk == to_start) {
272 /* Check for forward progress */
273 if (--retries == 0) {
274 if (status.error_code == IFS_NO_ERROR)
275 status.error_code = IFS_SW_PARTIAL_COMPLETION;
279 retries = MAX_IFS_RETRIES;
280 if (ifsd->generation)
281 activate.gen2.start = status_chunk;
283 activate.gen0.start = status_chunk;
284 to_start = status_chunk;
288 /* Update status for this core */
289 ifsd->scan_details = status.data;
291 if (status.signature_error) {
292 ifsd->status = SCAN_TEST_FAIL;
293 message_fail(dev, cpu, status);
294 } else if (status.control_error || status.error_code) {
295 ifsd->status = SCAN_NOT_TESTED;
296 message_not_tested(dev, cpu, status);
298 ifsd->status = SCAN_TEST_PASS;
/*
 * @data: pointer to the union ifs_array command word supplied by
 *        ifs_array_test_core(); also receives the result
 *
 * Rendezvouses with the sibling threads through array_cpus_in, then writes
 * command->data to MSR_ARRAY_BIST and reads the same MSR back into
 * command->data.
 *
 * NOTE(review): the declaration of first, the condition that restricts the
 * MSR access to one sibling and the return value are not visible in this
 * listing.
 */
302 static int do_array_test(void *data)
304 union ifs_array *command = data;
305 int cpu = smp_processor_id();
308 wait_for_sibling_cpu(&array_cpus_in, NSEC_PER_SEC);
311 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
313 first = cpumask_first(cpu_smt_mask(cpu));
316 wrmsrl(MSR_ARRAY_BIST, command->data);
317 /* Pass back the result of the test */
318 rdmsrl(MSR_ARRAY_BIST, command->data);
/*
 * @cpu: logical CPU on the core to test; passed to stop_core_cpuslocked()
 * @dev: IFS device; used to look up struct ifs_data
 *
 * Starts with every bit of command.array_bitmask set and repeatedly runs
 * do_array_test() through stop_core_cpuslocked(), resetting array_cpus_in
 * before each run, for as long as command.array_bitmask is non-zero. The
 * loop also checks a deadline of HZ / 2 jiffies and command.ctrl_result.
 * Afterwards command.data is stored in ifsd->scan_details and ifsd->status
 * is set: SCAN_TEST_FAIL when ctrl_result is set, SCAN_NOT_TESTED when the
 * run timed out or bits remain in array_bitmask, SCAN_TEST_PASS as the
 * remaining outcome.
 *
 * NOTE(review): the loop opener, the loop exits and the statement that sets
 * timed_out are not visible in this listing.
 */
324 static void ifs_array_test_core(int cpu, struct device *dev)
326 union ifs_array command = {};
327 bool timed_out = false;
328 struct ifs_data *ifsd;
329 unsigned long timeout;
331 ifsd = ifs_get_data(dev);
333 command.array_bitmask = ~0U;
334 timeout = jiffies + HZ / 2;
337 if (time_after(jiffies, timeout)) {
341 atomic_set(&array_cpus_in, 0);
342 stop_core_cpuslocked(cpu, do_array_test, &command);
344 if (command.ctrl_result)
346 } while (command.array_bitmask);
348 ifsd->scan_details = command.data;
350 if (command.ctrl_result)
351 ifsd->status = SCAN_TEST_FAIL;
352 else if (timed_out || command.array_bitmask)
353 ifsd->status = SCAN_NOT_TESTED;
355 ifsd->status = SCAN_TEST_PASS;
/*
 * ARRAY_GEN1_TEST_ALL_ARRAYS is the value do_array_test_gen1() writes to
 * MSR_ARRAY_TRIGGER; ARRAY_GEN1_STATUS_FAIL is the bit ifs_array_test_gen1()
 * tests in the value read back from MSR_ARRAY_STATUS.
 */
358 #define ARRAY_GEN1_TEST_ALL_ARRAYS 0x0ULL
359 #define ARRAY_GEN1_STATUS_FAIL 0x1ULL
/*
 * @status: pointer to a u64 that receives the MSR_ARRAY_STATUS value
 *
 * Writes ARRAY_GEN1_TEST_ALL_ARRAYS to MSR_ARRAY_TRIGGER and reads
 * MSR_ARRAY_STATUS into *@status.
 *
 * NOTE(review): the declaration of first, the condition that uses it and
 * the return value are not visible in this listing.
 */
361 static int do_array_test_gen1(void *status)
363 int cpu = smp_processor_id();
366 first = cpumask_first(cpu_smt_mask(cpu));
369 wrmsrl(MSR_ARRAY_TRIGGER, ARRAY_GEN1_TEST_ALL_ARRAYS);
370 rdmsrl(MSR_ARRAY_STATUS, *((u64 *)status));
/*
 * @cpu: logical CPU on the core to test; passed to stop_core_cpuslocked()
 * @dev: IFS device; used to look up struct ifs_data
 *
 * Runs do_array_test_gen1() through stop_core_cpuslocked(), stores the
 * returned status word in ifsd->scan_details and sets ifsd->status to
 * SCAN_TEST_FAIL when ARRAY_GEN1_STATUS_FAIL is set in it; SCAN_TEST_PASS
 * is the other outcome.
 *
 * NOTE(review): the declaration of status is not visible in this listing.
 */
376 static void ifs_array_test_gen1(int cpu, struct device *dev)
378 struct ifs_data *ifsd = ifs_get_data(dev);
381 stop_core_cpuslocked(cpu, do_array_test_gen1, &status);
382 ifsd->scan_details = status;
384 if (status & ARRAY_GEN1_STATUS_FAIL)
385 ifsd->status = SCAN_TEST_FAIL;
387 ifsd->status = SCAN_TEST_PASS;
391 * Initiate per core test. It wakes up work queue threads on the target cpu and
392 * its sibling cpu. Once all sibling threads wake up, the scan test gets executed. It
393 * then waits for all sibling threads to finish the scan test.
/*
 * @cpu: logical CPU selecting the core to test
 * @dev: IFS device; supplies the test capabilities and struct ifs_data
 *
 * Checks that @cpu is online (logging a message when it is not) and
 * dispatches on test->test_num. One case calls ifs_test_core(); the
 * IFS_TYPE_ARRAY_BIST case calls ifs_array_test_core() when
 * ifsd->array_gen is ARRAY_GEN0 and ifs_array_test_gen1() is the
 * alternative.
 *
 * NOTE(review): the CPU-hotplug locking calls, the case label in front of
 * ifs_test_core(), the error paths and the return value are not visible in
 * this listing, and the body may continue past the last visible line.
 */
395 int do_core_test(int cpu, struct device *dev)
397 const struct ifs_test_caps *test = ifs_get_test_caps(dev);
398 struct ifs_data *ifsd = ifs_get_data(dev);
401 /* Prevent CPUs from being taken offline during the scan test */
404 if (!cpu_online(cpu)) {
405 dev_info(dev, "cannot test on the offline cpu %d\n", cpu);
410 switch (test->test_num) {
415 ifs_test_core(cpu, dev);
417 case IFS_TYPE_ARRAY_BIST:
418 if (ifsd->array_gen == ARRAY_GEN0)
419 ifs_array_test_core(cpu, dev);
421 ifs_array_test_gen1(cpu, dev);