]> Git Repo - J-linux.git/blob - drivers/ptp/ptp_vmclock.c
Merge tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
[J-linux.git] / drivers / ptp / ptp_vmclock.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Virtual PTP 1588 clock for use with LM-safe VMclock device.
4  *
5  * Copyright © 2024 Amazon.com, Inc. or its affiliates.
6  */
7
8 #include <linux/acpi.h>
9 #include <linux/device.h>
10 #include <linux/err.h>
11 #include <linux/file.h>
12 #include <linux/fs.h>
13 #include <linux/init.h>
14 #include <linux/kernel.h>
15 #include <linux/miscdevice.h>
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/platform_device.h>
19 #include <linux/slab.h>
20
21 #include <uapi/linux/vmclock-abi.h>
22
23 #include <linux/ptp_clock_kernel.h>
24
25 #ifdef CONFIG_X86
26 #include <asm/pvclock.h>
27 #include <asm/kvmclock.h>
28 #endif
29
/*
 * SUPPORT_KVMCLOCK gates the code below that converts a counter reading
 * into a KVM clock value; it is only built when CONFIG_KVM_GUEST is set.
 */
#ifdef CONFIG_KVM_GUEST
#define SUPPORT_KVMCLOCK
#endif

/* Allocator for the per-device index used to build the "vmclock%d" name. */
static DEFINE_IDA(vmclock_ida);

ACPI_MODULE_NAME("vmclock");
37
/* Per-device driver state; allocated zeroed in vmclock_probe(). */
struct vmclock_state {
        /* Memory resource describing the shared structure (from ACPI _CRS). */
        struct resource res;
        /* Kernel mapping of that resource. */
        struct vmclock_abi *clk;
        /* Misc character device offering read() and mmap() of the structure. */
        struct miscdevice miscdev;
        /* Per-device copy of ptp_vmclock_info, with the name filled in. */
        struct ptp_clock_info ptp_clock_info;
        /* Registered PTP clock, or NULL if none was registered. */
        struct ptp_clock *ptp_clock;
        /*
         * cs_id: clocksource ID of the counter the device time is based on.
         * sys_cs_id: starts equal to cs_id and is updated by
         * ptp_vmclock_getcrosststamp() when the system clocksource turns out
         * to be the TSC or the KVM clock.
         */
        enum clocksource_ids cs_id, sys_cs_id;
        /* Index allocated from vmclock_ida. */
        int index;
        /* "vmclock<index>", used for the misc device and PTP clock names. */
        char *name;
};
48
/* Upper bound on the time spent retrying for a consistent seq_count read. */
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

/*
 * True when the size advertised in structure _c is large enough to contain
 * the whole of field _f.
 */
#define VMCLOCK_FIELD_PRESENT(_c, _f)                     \
        (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \
                                     sizeof((_c)->_f)))
57
/*
 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64,
 * shift the 128-bit product right by 'shift' bits, and add the fractional
 * second part of the reference time.
 *
 * The result is a 128-bit value, the top 64 bits of which are seconds, and
 * the low 64 bits are (seconds >> 64). The top half is stored in *res_hi and
 * the low half is returned.
 *
 * 'shift' is passed straight from the device-provided structure. Shifting a
 * 128-bit value by 128 bits or more is undefined in C, so such a shift is
 * handled explicitly as shifting every bit of the product out.
 */
static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
                                        uint64_t period, uint8_t shift,
                                        uint64_t frac_sec)
{
        unsigned __int128 res = 0;

        if (shift < 128)
                res = ((unsigned __int128)delta * period) >> shift;

        res += frac_sec;
        *res_hi = res >> 64;
        return (uint64_t)res;
}
76
77 static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
78 {
79         if (likely(clk->time_type == VMCLOCK_TIME_UTC))
80                 return true;
81
82         if (clk->time_type == VMCLOCK_TIME_TAI &&
83             (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
84                 if (sec)
85                         *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
86                 return true;
87         }
88         return false;
89 }
90
/*
 * Compute the device time corresponding to a freshly read counter value.
 *
 * @st:             device state
 * @sts:            optional; when non-NULL the pre/post system timestamps
 *                  are filled in around the counter read
 * @system_counter: optional; when non-NULL receives the counter value that
 *                  was used together with st->cs_id
 * @tspec:          receives the computed device time
 *
 * The shared structure is read under its seq_count: the calculation is
 * repeated until seq_count (with the low bit masked off for the first read)
 * is unchanged across it, or until VMCLOCK_MAX_WAIT has elapsed.
 *
 * Returns 0 on success; -EINVAL if the TSC is marked unstable (x86), the
 * device reports VMCLOCK_STATUS_UNRELIABLE, or tai_adjust() rejects the time
 * type; -ETIMEDOUT if no consistent read was obtained before the deadline.
 */
static int vmclock_get_crosststamp(struct vmclock_state *st,
                                   struct ptp_system_timestamp *sts,
                                   struct system_counterval_t *system_counter,
                                   struct timespec64 *tspec)
{
        ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
        struct system_time_snapshot systime_snapshot;
        uint64_t cycle, delta, seq, frac_sec;

#ifdef CONFIG_X86
        /*
         * We'd expect the hypervisor to know this and to report the clock
         * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
         */
        if (check_tsc_unstable())
                return -EINVAL;
#endif

        while (1) {
                /*
                 * Mask off the low bit so that an odd starting value can
                 * never match the re-read at the bottom of the loop.
                 */
                seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;

                /*
                 * This pairs with a write barrier in the hypervisor
                 * which populates this structure.
                 */
                virt_rmb();

                if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
                        return -EINVAL;

                /*
                 * When invoked for gettimex64(), fill in the pre/post system
                 * times. The simple case is when system time is based on the
                 * same counter as st->cs_id, in which case all three times
                 * will be derived from the *same* counter value.
                 *
                 * If the system isn't using the same counter, then the value
                 * from ktime_get_snapshot() will still be used as pre_ts, and
                 * ptp_read_system_postts() is called to populate postts after
                 * calling get_cycles().
                 *
                 * The conversion to timespec64 happens further down, outside
                 * the seq_count loop.
                 */
                if (sts) {
                        ktime_get_snapshot(&systime_snapshot);
                        if (systime_snapshot.cs_id == st->cs_id) {
                                cycle = systime_snapshot.cycles;
                        } else {
                                cycle = get_cycles();
                                ptp_read_system_postts(sts);
                        }
                } else {
                        cycle = get_cycles();
                }

                /* Counter ticks elapsed since the device's reference point. */
                delta = cycle - le64_to_cpu(st->clk->counter_value);

                /*
                 * Scale the elapsed ticks by the counter period and add the
                 * reference fractional second; whole seconds land in
                 * tv_sec, the returned 64-bit fraction is turned into
                 * nanoseconds below.
                 */
                frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
                                                   le64_to_cpu(st->clk->counter_period_frac_sec),
                                                   st->clk->counter_period_shift,
                                                   le64_to_cpu(st->clk->time_frac_sec));
                tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
                tspec->tv_sec += le64_to_cpu(st->clk->time_sec);

                if (!tai_adjust(st->clk, &tspec->tv_sec))
                        return -EINVAL;

                /*
                 * This pairs with a write barrier in the hypervisor
                 * which populates this structure.
                 */
                virt_rmb();
                if (seq == le32_to_cpu(st->clk->seq_count))
                        break;

                if (ktime_after(ktime_get(), deadline))
                        return -ETIMEDOUT;
        }

        if (system_counter) {
                system_counter->cycles = cycle;
                system_counter->cs_id = st->cs_id;
        }

        if (sts) {
                sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
                /* Same counter: pre and post are the very same instant. */
                if (systime_snapshot.cs_id == st->cs_id)
                        sts->post_ts = sts->pre_ts;
        }

        return 0;
}
184
#ifdef SUPPORT_KVMCLOCK
/*
 * In the case where the system is using the KVM clock for timekeeping, convert
 * the TSC value into a KVM clock time in order to return a paired reading that
 * get_device_system_crosststamp() can cope with.
 *
 * Takes the same arguments as vmclock_get_crosststamp() and returns its
 * result; @system_counter must be non-NULL because it is rewritten here to
 * hold the KVM clock value with cs_id CSID_X86_KVM_CLK.
 */
static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
                                            struct ptp_system_timestamp *sts,
                                            struct system_counterval_t *system_counter,
                                            struct timespec64 *tspec)
{
        struct pvclock_vcpu_time_info *pvti;
        unsigned int pvti_ver;
        int ret;

        preempt_disable_notrace();

        /*
         * Look up this CPU's pvclock area only once preemption is off, so
         * that the task cannot migrate between the lookup and its use and
         * end up pairing one CPU's counter with another CPU's parameters.
         */
        pvti = this_cpu_pvti();

        do {
                pvti_ver = pvclock_read_begin(pvti);

                ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
                if (ret)
                        break;

                system_counter->cycles = __pvclock_read_cycles(pvti,
                                                               system_counter->cycles);
                system_counter->cs_id = CSID_X86_KVM_CLK;

                /*
                 * This retry should never really happen; if the TSC is
                 * stable and reliable enough across vCPUS that it is sane
                 * for the hypervisor to expose a VMCLOCK device which uses
                 * it as the reference counter, then the KVM clock should be
                 * in 'master clock mode' and basically never changed. But
                 * the KVM clock is a fickle and often broken thing, so do
                 * it "properly" just in case.
                 */
        } while (pvclock_read_retry(pvti, pvti_ver));

        preempt_enable_notrace();

        return ret;
}
#endif
229
230 static int ptp_vmclock_get_time_fn(ktime_t *device_time,
231                                    struct system_counterval_t *system_counter,
232                                    void *ctx)
233 {
234         struct vmclock_state *st = ctx;
235         struct timespec64 tspec;
236         int ret;
237
238 #ifdef SUPPORT_KVMCLOCK
239         if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
240                 ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
241                                                        &tspec);
242         else
243 #endif
244                 ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
245
246         if (!ret)
247                 *device_time = timespec64_to_ktime(tspec);
248
249         return ret;
250 }
251
252 static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
253                                       struct system_device_crosststamp *xtstamp)
254 {
255         struct vmclock_state *st = container_of(ptp, struct vmclock_state,
256                                                 ptp_clock_info);
257         int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
258                                                 NULL, xtstamp);
259 #ifdef SUPPORT_KVMCLOCK
260         /*
261          * On x86, the KVM clock may be used for the system time. We can
262          * actually convert a TSC reading to that, and return a paired
263          * timestamp that get_device_system_crosststamp() *can* handle.
264          */
265         if (ret == -ENODEV) {
266                 struct system_time_snapshot systime_snapshot;
267
268                 ktime_get_snapshot(&systime_snapshot);
269
270                 if (systime_snapshot.cs_id == CSID_X86_TSC ||
271                     systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
272                         WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
273                         ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
274                                                             st, NULL, xtstamp);
275                 }
276         }
277 #endif
278         return ret;
279 }
280
281 /*
282  * PTP clock operations
283  */
284
/* .adjfine operation: frequency adjustment is not supported by this clock. */
static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
{
        return -EOPNOTSUPP;
}
289
/* .adjtime operation: stepping the clock is not supported. */
static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
        return -EOPNOTSUPP;
}
294
/* .settime64 operation: setting the clock is not supported. */
static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
                           const struct timespec64 *ts)
{
        return -EOPNOTSUPP;
}
300
/*
 * .gettimex64 operation: read the device time into @ts and, when @sts is
 * non-NULL, the surrounding system timestamps. No system counter value is
 * requested. Returns the result of vmclock_get_crosststamp().
 */
static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
                                struct ptp_system_timestamp *sts)
{
        struct vmclock_state *st = container_of(ptp, struct vmclock_state,
                                                ptp_clock_info);

        return vmclock_get_crosststamp(st, sts, NULL, ts);
}
309
/* .enable operation: no ancillary features can be enabled on this clock. */
static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
                          struct ptp_clock_request *rq, int on)
{
        return -EOPNOTSUPP;
}
315
/*
 * Template PTP clock description; vmclock_ptp_register() copies it into the
 * per-device state and fills in the name. The clock is read-only: no
 * adjustment range, external timestamps, pins or PPS are advertised, and the
 * adjust/set/enable operations all return -EOPNOTSUPP.
 */
static const struct ptp_clock_info ptp_vmclock_info = {
        .owner          = THIS_MODULE,
        .max_adj        = 0,
        .n_ext_ts       = 0,
        .n_pins         = 0,
        .pps            = 0,
        .adjfine        = ptp_vmclock_adjfine,
        .adjtime        = ptp_vmclock_adjtime,
        .gettimex64     = ptp_vmclock_gettimex,
        .settime64      = ptp_vmclock_settime,
        .enable         = ptp_vmclock_enable,
        .getcrosststamp = ptp_vmclock_getcrosststamp,
};
329
330 static struct ptp_clock *vmclock_ptp_register(struct device *dev,
331                                               struct vmclock_state *st)
332 {
333         enum clocksource_ids cs_id;
334
335         if (IS_ENABLED(CONFIG_ARM64) &&
336             st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
337                 /* Can we check it's the virtual counter? */
338                 cs_id = CSID_ARM_ARCH_COUNTER;
339         } else if (IS_ENABLED(CONFIG_X86) &&
340                    st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
341                 cs_id = CSID_X86_TSC;
342         } else {
343                 return NULL;
344         }
345
346         /* Only UTC, or TAI with offset */
347         if (!tai_adjust(st->clk, NULL)) {
348                 dev_info(dev, "vmclock does not provide unambiguous UTC\n");
349                 return NULL;
350         }
351
352         st->sys_cs_id = cs_id;
353         st->cs_id = cs_id;
354         st->ptp_clock_info = ptp_vmclock_info;
355         strscpy(st->ptp_clock_info.name, st->name);
356
357         return ptp_clock_register(&st->ptp_clock_info, dev);
358 }
359
360 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
361 {
362         struct vmclock_state *st = container_of(fp->private_data,
363                                                 struct vmclock_state, miscdev);
364
365         if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
366                 return -EROFS;
367
368         if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
369                 return -EINVAL;
370
371         if (io_remap_pfn_range(vma, vma->vm_start,
372                                st->res.start >> PAGE_SHIFT, PAGE_SIZE,
373                                vma->vm_page_prot))
374                 return -EAGAIN;
375
376         return 0;
377 }
378
/*
 * read handler for the misc device: copy up to the first PAGE_SIZE bytes of
 * the shared structure to userspace as a consistent snapshot.
 *
 * The copy is repeated until seq_count is unchanged across it (the first
 * read has its low bit masked off, so an odd value never matches), or until
 * VMCLOCK_MAX_WAIT has elapsed.
 *
 * Returns the number of bytes copied, 0 when *ppos is at or beyond
 * PAGE_SIZE, -EFAULT if the copy to userspace fails, or -ETIMEDOUT if no
 * consistent snapshot could be taken before the deadline.
 */
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
                                    size_t count, loff_t *ppos)
{
        struct vmclock_state *st = container_of(fp->private_data,
                                                struct vmclock_state, miscdev);
        ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
        size_t max_count;
        uint32_t seq;

        if (*ppos >= PAGE_SIZE)
                return 0;

        /* Clamp the request to what remains of the page. */
        max_count = PAGE_SIZE - *ppos;
        if (count > max_count)
                count = max_count;

        while (1) {
                seq = le32_to_cpu(st->clk->seq_count) & ~1U;
                /* Pairs with hypervisor wmb */
                virt_rmb();

                if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
                        return -EFAULT;

                /* Pairs with hypervisor wmb */
                virt_rmb();
                if (seq == le32_to_cpu(st->clk->seq_count))
                        break;

                if (ktime_after(ktime_get(), deadline))
                        return -ETIMEDOUT;
        }

        *ppos += count;
        return count;
}
415
416 static const struct file_operations vmclock_miscdev_fops = {
417         .mmap = vmclock_miscdev_mmap,
418         .read = vmclock_miscdev_read,
419 };
420
421 /* module operations */
422
423 static void vmclock_remove(struct platform_device *pdev)
424 {
425         struct device *dev = &pdev->dev;
426         struct vmclock_state *st = dev_get_drvdata(dev);
427
428         if (st->ptp_clock)
429                 ptp_clock_unregister(st->ptp_clock);
430
431         if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
432                 misc_deregister(&st->miscdev);
433 }
434
435 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
436 {
437         struct vmclock_state *st = data;
438         struct resource_win win;
439         struct resource *res = &win.res;
440
441         if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
442                 return AE_OK;
443
444         /* There can be only one */
445         if (resource_type(&st->res) == IORESOURCE_MEM)
446                 return AE_ERROR;
447
448         if (acpi_dev_resource_memory(ares, res) ||
449             acpi_dev_resource_address_space(ares, &win)) {
450
451                 if (resource_type(res) != IORESOURCE_MEM ||
452                     resource_size(res) < sizeof(st->clk))
453                         return AE_ERROR;
454
455                 st->res = *res;
456                 return AE_OK;
457         }
458
459         return AE_ERROR;
460 }
461
462 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
463 {
464         struct acpi_device *adev = ACPI_COMPANION(dev);
465         acpi_status status;
466
467         /*
468          * This should never happen as this function is only called when
469          * has_acpi_companion(dev) is true, but the logic is sufficiently
470          * complex that Coverity can't see the tautology.
471          */
472         if (!adev)
473                 return -ENODEV;
474
475         status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
476                                      vmclock_acpi_resources, st);
477         if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
478                 dev_err(dev, "failed to get resources\n");
479                 return -ENODEV;
480         }
481
482         return 0;
483 }
484
/*
 * Device-managed cleanup action (installed by vmclock_probe()): give the
 * device's index back to vmclock_ida. @data is the struct vmclock_state.
 */
static void vmclock_put_idx(void *data)
{
        struct vmclock_state *st = data;

        ida_free(&vmclock_ida, st->index);
}
491
492 static int vmclock_probe(struct platform_device *pdev)
493 {
494         struct device *dev = &pdev->dev;
495         struct vmclock_state *st;
496         int ret;
497
498         st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
499         if (!st)
500                 return -ENOMEM;
501
502         if (has_acpi_companion(dev))
503                 ret = vmclock_probe_acpi(dev, st);
504         else
505                 ret = -EINVAL; /* Only ACPI for now */
506
507         if (ret) {
508                 dev_info(dev, "Failed to obtain physical address: %d\n", ret);
509                 goto out;
510         }
511
512         if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
513                 dev_info(dev, "Region too small (0x%llx)\n",
514                          resource_size(&st->res));
515                 ret = -EINVAL;
516                 goto out;
517         }
518         st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
519                                 MEMREMAP_WB | MEMREMAP_DEC);
520         if (IS_ERR(st->clk)) {
521                 ret = PTR_ERR(st->clk);
522                 dev_info(dev, "failed to map shared memory\n");
523                 st->clk = NULL;
524                 goto out;
525         }
526
527         if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
528             le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
529             le16_to_cpu(st->clk->version) != 1) {
530                 dev_info(dev, "vmclock magic fields invalid\n");
531                 ret = -EINVAL;
532                 goto out;
533         }
534
535         ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
536         if (ret < 0)
537                 goto out;
538
539         st->index = ret;
540         ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
541         if (ret)
542                 goto out;
543
544         st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
545         if (!st->name) {
546                 ret = -ENOMEM;
547                 goto out;
548         }
549
550         /*
551          * If the structure is big enough, it can be mapped to userspace.
552          * Theoretically a guest OS even using larger pages could still
553          * use 4KiB PTEs to map smaller MMIO regions like this, but let's
554          * cross that bridge if/when we come to it.
555          */
556         if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
557                 st->miscdev.minor = MISC_DYNAMIC_MINOR;
558                 st->miscdev.fops = &vmclock_miscdev_fops;
559                 st->miscdev.name = st->name;
560
561                 ret = misc_register(&st->miscdev);
562                 if (ret)
563                         goto out;
564         }
565
566         /* If there is valid clock information, register a PTP clock */
567         if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
568                 /* Can return a silent NULL, or an error. */
569                 st->ptp_clock = vmclock_ptp_register(dev, st);
570                 if (IS_ERR(st->ptp_clock)) {
571                         ret = PTR_ERR(st->ptp_clock);
572                         st->ptp_clock = NULL;
573                         vmclock_remove(pdev);
574                         goto out;
575                 }
576         }
577
578         if (!st->miscdev.minor && !st->ptp_clock) {
579                 /* Neither miscdev nor PTP registered */
580                 dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
581                 ret = -ENODEV;
582                 goto out;
583         }
584
585         dev_info(dev, "%s: registered %s%s%s\n", st->name,
586                  st->miscdev.minor ? "miscdev" : "",
587                  (st->miscdev.minor && st->ptp_clock) ? ", " : "",
588                  st->ptp_clock ? "PTP" : "");
589
590         dev_set_drvdata(dev, st);
591
592  out:
593         return ret;
594 }
595
/* ACPI hardware IDs this driver binds to; the empty entry ends the table. */
static const struct acpi_device_id vmclock_acpi_ids[] = {
        { "AMZNC10C", 0 },
        {}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
601
/* Platform driver glue: matched through the ACPI ID table above. */
static struct platform_driver vmclock_platform_driver = {
        .probe          = vmclock_probe,
        .remove         = vmclock_remove,
        .driver = {
                .name   = "vmclock",
                .acpi_match_table = vmclock_acpi_ids,
        },
};
610
/* Generates the module init/exit that register and unregister the driver. */
module_platform_driver(vmclock_platform_driver)

MODULE_AUTHOR("David Woodhouse <[email protected]>");
MODULE_DESCRIPTION("PTP clock using VMCLOCK");
MODULE_LICENSE("GPL");
This page took 0.060202 seconds and 4 git commands to generate.