kernel/events/core.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <[email protected]>
   5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <[email protected]>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <[email protected]>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/idr.h>
  17 #include <linux/file.h>
  18 #include <linux/poll.h>
  19 #include <linux/slab.h>
  20 #include <linux/hash.h>
  21 #include <linux/tick.h>
  22 #include <linux/sysfs.h>
  23 #include <linux/dcache.h>
  24 #include <linux/percpu.h>
  25 #include <linux/ptrace.h>
  26 #include <linux/reboot.h>
  27 #include <linux/vmstat.h>
  28 #include <linux/device.h>
  29 #include <linux/export.h>
  30 #include <linux/vmalloc.h>
  31 #include <linux/hardirq.h>
  32 #include <linux/rculist.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/syscalls.h>
  35 #include <linux/anon_inodes.h>
  36 #include <linux/kernel_stat.h>
  37 #include <linux/perf_event.h>
  38 #include <linux/ftrace_event.h>
  39 #include <linux/hw_breakpoint.h>
  40 #include <linux/mm_types.h>
  41 #include <linux/cgroup.h>
  42 #include <linux/module.h>
  43 #include <linux/mman.h>
  44 #include <linux/compat.h>
  45
  46 #include "internal.h"
  47
  48 #include <asm/irq_regs.h>
  49
  50 static struct workqueue_struct *perf_wq;
  51
  52 struct remote_function_call {
  53         struct task_struct      *p;
  54         int                     (*func)(void *info);
  55         void                    *info;
  56         int                     ret;
  57 };
  58
  59 static void remote_function(void *data)
  60 {
  61         struct remote_function_call *tfc = data;
  62         struct task_struct *p = tfc->p;
  63
  64         if (p) {
  65                 tfc->ret = -EAGAIN;
  66                 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
  67                         return;
  68         }
  69
  70         tfc->ret = tfc->func(tfc->info);
  71 }
  72
  73 /**
  74  * task_function_call - call a function on the cpu on which a task runs
  75  * @p:          the task to evaluate
  76  * @func:       the function to be called
  77  * @info:       the function call argument
  78  *
  79  * Calls the function @func when the task is currently running. This might
  80  * be on the current CPU, which just calls the function directly
  81  *
  82  * returns: @func return value, or
  83  *          -ESRCH  - when the process isn't running
  84  *          -EAGAIN - when the process moved away
  85  */
  86 static int
  87 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  88 {
  89         struct remote_function_call data = {
  90                 .p      = p,
  91                 .func   = func,
  92                 .info   = info,
  93                 .ret    = -ESRCH, /* No such (running) process */
  94         };
  95
  96         if (task_curr(p))
  97                 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
  98
  99         return data.ret;
 100 }
 101
 102 /**
 103  * cpu_function_call - call a function on the cpu
 104  * @func:       the function to be called
 105  * @info:       the function call argument
 106  *
 107  * Calls the function @func on the remote cpu.
 108  *
 109  * returns: @func return value or -ENXIO when the cpu is offline
 110  */
 111 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 112 {
 113         struct remote_function_call data = {
 114                 .p      = NULL,
 115                 .func   = func,
 116                 .info   = info,
 117                 .ret    = -ENXIO, /* No such CPU */
 118         };
 119
 120         smp_call_function_single(cpu, remote_function, &data, 1);
 121
 122         return data.ret;
 123 }
 124
 125 #define EVENT_OWNER_KERNEL ((void *) -1)
 126
 127 static bool is_kernel_event(struct perf_event *event)
 128 {
 129         return event->owner == EVENT_OWNER_KERNEL;
 130 }
 131
 132 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 133                        PERF_FLAG_FD_OUTPUT  |\
 134                        PERF_FLAG_PID_CGROUP |\
 135                        PERF_FLAG_FD_CLOEXEC)
 136
 137 /*
 138  * branch priv levels that need permission checks
 139  */
 140 #define PERF_SAMPLE_BRANCH_PERM_PLM \
 141         (PERF_SAMPLE_BRANCH_KERNEL |\
 142          PERF_SAMPLE_BRANCH_HV)
 143
 144 enum event_type_t {
 145         EVENT_FLEXIBLE = 0x1,
 146         EVENT_PINNED = 0x2,
 147         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 148 };
 149
 150 /*
 151  * perf_sched_events : >0 events exist
 152  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 153  */
 154 struct static_key_deferred perf_sched_events __read_mostly;
 155 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 156 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 157
 158 static atomic_t nr_mmap_events __read_mostly;
 159 static atomic_t nr_comm_events __read_mostly;
 160 static atomic_t nr_task_events __read_mostly;
 161 static atomic_t nr_freq_events __read_mostly;
 162
 163 static LIST_HEAD(pmus);
 164 static DEFINE_MUTEX(pmus_lock);
 165 static struct srcu_struct pmus_srcu;
 166
 167 /*
 168  * perf event paranoia level:
 169  *  -1 - not paranoid at all
 170  *   0 - disallow raw tracepoint access for unpriv
 171  *   1 - disallow cpu events for unpriv
 172  *   2 - disallow kernel profiling for unpriv
 173  */
 174 int sysctl_perf_event_paranoid __read_mostly = 1;
 175
 176 /* Minimum for 512 kiB + 1 user control page */
 177 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 178
 179 /*
 180  * max perf event sample rate
 181  */
 182 #define DEFAULT_MAX_SAMPLE_RATE         100000
 183 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 184 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
 185
 186 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 187
 188 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 189 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 190
 191 static int perf_sample_allowed_ns __read_mostly =
 192         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 193
 194 void update_perf_cpu_limits(void)
 195 {
 196         u64 tmp = perf_sample_period_ns;
 197
 198         tmp *= sysctl_perf_cpu_time_max_percent;
 199         do_div(tmp, 100);
 200         ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 201 }
 202
 203 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 204
 205 int perf_proc_update_handler(struct ctl_table *table, int write,
 206                 void __user *buffer, size_t *lenp,
 207                 loff_t *ppos)
 208 {
 209         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 210
 211         if (ret || !write)
 212                 return ret;
 213
 214         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 215         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 216         update_perf_cpu_limits();
 217
 218         return 0;
 219 }
 220
 221 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 222
 223 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 224                                 void __user *buffer, size_t *lenp,
 225                                 loff_t *ppos)
 226 {
 227         int ret = proc_dointvec(table, write, buffer, lenp, ppos);
 228
 229         if (ret || !write)
 230                 return ret;
 231
 232         update_perf_cpu_limits();
 233
 234         return 0;
 235 }
 236
 237 /*
 238  * perf samples are done in some very critical code paths (NMIs).
 239  * If they take too much CPU time, the system can lock up and not
 240  * get any real work done.  This will drop the sample rate when
 241  * we detect that events are taking too long.
 242  */
 243 #define NR_ACCUMULATED_SAMPLES 128
 244 static DEFINE_PER_CPU(u64, running_sample_length);
 245
 246 static void perf_duration_warn(struct irq_work *w)
 247 {
 248         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 249         u64 avg_local_sample_len;
 250         u64 local_samples_len;
 251
 252         local_samples_len = __this_cpu_read(running_sample_length);
 253         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 254
 255         printk_ratelimited(KERN_WARNING
 256                         "perf interrupt took too long (%lld > %lld), lowering "
 257                         "kernel.perf_event_max_sample_rate to %d\n",
 258                         avg_local_sample_len, allowed_ns >> 1,
 259                         sysctl_perf_event_sample_rate);
 260 }
 261
 262 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 263
 264 void perf_sample_event_took(u64 sample_len_ns)
 265 {
 266         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 267         u64 avg_local_sample_len;
 268         u64 local_samples_len;
 269
 270         if (allowed_ns == 0)
 271                 return;
 272
 273         /* decay the counter by 1 average sample */
 274         local_samples_len = __this_cpu_read(running_sample_length);
 275         local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
 276         local_samples_len += sample_len_ns;
 277         __this_cpu_write(running_sample_length, local_samples_len);
 278
 279         /*
 280          * note: this will be biased artifically low until we have
 281          * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
 282          * from having to maintain a count.
 283          */
 284         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 285
 286         if (avg_local_sample_len <= allowed_ns)
 287                 return;
 288
 289         if (max_samples_per_tick <= 1)
 290                 return;
 291
 292         max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
 293         sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 294         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 295
 296         update_perf_cpu_limits();
 297
 298         if (!irq_work_queue(&perf_duration_work)) {
 299                 early_printk("perf interrupt took too long (%lld > %lld), lowering "
 300                              "kernel.perf_event_max_sample_rate to %d\n",
 301                              avg_local_sample_len, allowed_ns >> 1,
 302                              sysctl_perf_event_sample_rate);
 303         }
 304 }
 305
 306 static atomic64_t perf_event_id;
 307
 308 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 309                               enum event_type_t event_type);
 310
 311 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 312                              enum event_type_t event_type,
 313                              struct task_struct *task);
 314
 315 static void update_context_time(struct perf_event_context *ctx);
 316 static u64 perf_event_time(struct perf_event *event);
 317
 318 void __weak perf_event_print_debug(void)        { }
 319
 320 extern __weak const char *perf_pmu_name(void)
 321 {
 322         return "pmu";
 323 }
 324
 325 static inline u64 perf_clock(void)
 326 {
 327         return local_clock();
 328 }
 329
 330 static inline struct perf_cpu_context *
 331 __get_cpu_context(struct perf_event_context *ctx)
 332 {
 333         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 334 }
 335
 336 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 337                           struct perf_event_context *ctx)
 338 {
 339         raw_spin_lock(&cpuctx->ctx.lock);
 340         if (ctx)
 341                 raw_spin_lock(&ctx->lock);
 342 }
 343
 344 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 345                             struct perf_event_context *ctx)
 346 {
 347         if (ctx)
 348                 raw_spin_unlock(&ctx->lock);
 349         raw_spin_unlock(&cpuctx->ctx.lock);
 350 }
 351
 352 #ifdef CONFIG_CGROUP_PERF
 353
 354 /*
 355  * perf_cgroup_info keeps track of time_enabled for a cgroup.
 356  * This is a per-cpu dynamically allocated data structure.
 357  */
 358 struct perf_cgroup_info {
 359         u64                             time;
 360         u64                             timestamp;
 361 };
 362
 363 struct perf_cgroup {
 364         struct cgroup_subsys_state      css;
 365         struct perf_cgroup_info __percpu *info;
 366 };
 367
 368 /*
 369  * Must ensure cgroup is pinned (css_get) before calling
 370  * this function. In other words, we cannot call this function
 371  * if there is no cgroup event for the current CPU context.
 372  */
 373 static inline struct perf_cgroup *
 374 perf_cgroup_from_task(struct task_struct *task)
 375 {
 376         return container_of(task_css(task, perf_event_cgrp_id),
 377                             struct perf_cgroup, css);
 378 }
 379
 380 static inline bool
 381 perf_cgroup_match(struct perf_event *event)
 382 {
 383         struct perf_event_context *ctx = event->ctx;
 384         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 385
 386         /* @event doesn't care about cgroup */
 387         if (!event->cgrp)
 388                 return true;
 389
 390         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 391         if (!cpuctx->cgrp)
 392                 return false;
 393
 394         /*
 395          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 396          * also enabled for all its descendant cgroups.  If @cpuctx's
 397          * cgroup is a descendant of @event's (the test covers identity
 398          * case), it's a match.
 399          */
 400         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 401                                     event->cgrp->css.cgroup);
 402 }
 403
 404 static inline void perf_detach_cgroup(struct perf_event *event)
 405 {
 406         css_put(&event->cgrp->css);
 407         event->cgrp = NULL;
 408 }
 409
 410 static inline int is_cgroup_event(struct perf_event *event)
 411 {
 412         return event->cgrp != NULL;
 413 }
 414
 415 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 416 {
 417         struct perf_cgroup_info *t;
 418
 419         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 420         return t->time;
 421 }
 422
 423 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 424 {
 425         struct perf_cgroup_info *info;
 426         u64 now;
 427
 428         now = perf_clock();
 429
 430         info = this_cpu_ptr(cgrp->info);
 431
 432         info->time += now - info->timestamp;
 433         info->timestamp = now;
 434 }
 435
 436 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 437 {
 438         struct perf_cgroup *cgrp_out = cpuctx->cgrp;
 439         if (cgrp_out)
 440                 __update_cgrp_time(cgrp_out);
 441 }
 442
 443 static inline void update_cgrp_time_from_event(struct perf_event *event)
 444 {
 445         struct perf_cgroup *cgrp;
 446
 447         /*
 448          * ensure we access cgroup data only when needed and
 449          * when we know the cgroup is pinned (css_get)
 450          */
 451         if (!is_cgroup_event(event))
 452                 return;
 453
 454         cgrp = perf_cgroup_from_task(current);
 455         /*
 456          * Do not update time when cgroup is not active
 457          */
 458         if (cgrp == event->cgrp)
 459                 __update_cgrp_time(event->cgrp);
 460 }
 461
 462 static inline void
 463 perf_cgroup_set_timestamp(struct task_struct *task,
 464                           struct perf_event_context *ctx)
 465 {
 466         struct perf_cgroup *cgrp;
 467         struct perf_cgroup_info *info;
 468
 469         /*
 470          * ctx->lock held by caller
 471          * ensure we do not access cgroup data
 472          * unless we have the cgroup pinned (css_get)
 473          */
 474         if (!task || !ctx->nr_cgroups)
 475                 return;
 476
 477         cgrp = perf_cgroup_from_task(task);
 478         info = this_cpu_ptr(cgrp->info);
 479         info->timestamp = ctx->timestamp;
 480 }
 481
 482 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 483 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 484
 485 /*
 486  * reschedule events based on the cgroup constraint of task.
 487  *
 488  * mode SWOUT : schedule out everything
 489  * mode SWIN : schedule in based on cgroup for next
 490  */
 491 void perf_cgroup_switch(struct task_struct *task, int mode)
 492 {
 493         struct perf_cpu_context *cpuctx;
 494         struct pmu *pmu;
 495         unsigned long flags;
 496
 497         /*
 498          * disable interrupts to avoid geting nr_cgroup
 499          * changes via __perf_event_disable(). Also
 500          * avoids preemption.
 501          */
 502         local_irq_save(flags);
 503
 504         /*
 505          * we reschedule only in the presence of cgroup
 506          * constrained events.
 507          */
 508         rcu_read_lock();
 509
 510         list_for_each_entry_rcu(pmu, &pmus, entry) {
 511                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 512                 if (cpuctx->unique_pmu != pmu)
 513                         continue; /* ensure we process each cpuctx once */
 514
 515                 /*
 516                  * perf_cgroup_events says at least one
 517                  * context on this CPU has cgroup events.
 518                  *
 519                  * ctx->nr_cgroups reports the number of cgroup
 520                  * events for a context.
 521                  */
 522                 if (cpuctx->ctx.nr_cgroups > 0) {
 523                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 524                         perf_pmu_disable(cpuctx->ctx.pmu);
 525
 526                         if (mode & PERF_CGROUP_SWOUT) {
 527                                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 528                                 /*
 529                                  * must not be done before ctxswout due
 530                                  * to event_filter_match() in event_sched_out()
 531                                  */
 532                                 cpuctx->cgrp = NULL;
 533                         }
 534
 535                         if (mode & PERF_CGROUP_SWIN) {
 536                                 WARN_ON_ONCE(cpuctx->cgrp);
 537                                 /*
 538                                  * set cgrp before ctxsw in to allow
 539                                  * event_filter_match() to not have to pass
 540                                  * task around
 541                                  */
 542                                 cpuctx->cgrp = perf_cgroup_from_task(task);
 543                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 544                         }
 545                         perf_pmu_enable(cpuctx->ctx.pmu);
 546                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 547                 }
 548         }
 549
 550         rcu_read_unlock();
 551
 552         local_irq_restore(flags);
 553 }
 554
 555 static inline void perf_cgroup_sched_out(struct task_struct *task,
 556                                          struct task_struct *next)
 557 {
 558         struct perf_cgroup *cgrp1;
 559         struct perf_cgroup *cgrp2 = NULL;
 560
 561         /*
 562          * we come here when we know perf_cgroup_events > 0
 563          */
 564         cgrp1 = perf_cgroup_from_task(task);
 565
 566         /*
 567          * next is NULL when called from perf_event_enable_on_exec()
 568          * that will systematically cause a cgroup_switch()
 569          */
 570         if (next)
 571                 cgrp2 = perf_cgroup_from_task(next);
 572
 573         /*
 574          * only schedule out current cgroup events if we know
 575          * that we are switching to a different cgroup. Otherwise,
 576          * do no touch the cgroup events.
 577          */
 578         if (cgrp1 != cgrp2)
 579                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 580 }
 581
 582 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 583                                         struct task_struct *task)
 584 {
 585         struct perf_cgroup *cgrp1;
 586         struct perf_cgroup *cgrp2 = NULL;
 587
 588         /*
 589          * we come here when we know perf_cgroup_events > 0
 590          */
 591         cgrp1 = perf_cgroup_from_task(task);
 592
 593         /* prev can never be NULL */
 594         cgrp2 = perf_cgroup_from_task(prev);
 595
 596         /*
 597          * only need to schedule in cgroup events if we are changing
 598          * cgroup during ctxsw. Cgroup events were not scheduled
 599          * out of ctxsw out if that was not the case.
 600          */
 601         if (cgrp1 != cgrp2)
 602                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 603 }
 604
 605 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 606                                       struct perf_event_attr *attr,
 607                                       struct perf_event *group_leader)
 608 {
 609         struct perf_cgroup *cgrp;
 610         struct cgroup_subsys_state *css;
 611         struct fd f = fdget(fd);
 612         int ret = 0;
 613
 614         if (!f.file)
 615                 return -EBADF;
 616
 617         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 618                                          &perf_event_cgrp_subsys);
 619         if (IS_ERR(css)) {
 620                 ret = PTR_ERR(css);
 621                 goto out;
 622         }
 623
 624         cgrp = container_of(css, struct perf_cgroup, css);
 625         event->cgrp = cgrp;
 626
 627         /*
 628          * all events in a group must monitor
 629          * the same cgroup because a task belongs
 630          * to only one perf cgroup at a time
 631          */
 632         if (group_leader && group_leader->cgrp != cgrp) {
 633                 perf_detach_cgroup(event);
 634                 ret = -EINVAL;
 635         }
 636 out:
 637         fdput(f);
 638         return ret;
 639 }
 640
 641 static inline void
 642 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 643 {
 644         struct perf_cgroup_info *t;
 645         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 646         event->shadow_ctx_time = now - t->timestamp;
 647 }
 648
 649 static inline void
 650 perf_cgroup_defer_enabled(struct perf_event *event)
 651 {
 652         /*
 653          * when the current task's perf cgroup does not match
 654          * the event's, we need to remember to call the
 655          * perf_mark_enable() function the first time a task with
 656          * a matching perf cgroup is scheduled in.
 657          */
 658         if (is_cgroup_event(event) && !perf_cgroup_match(event))
 659                 event->cgrp_defer_enabled = 1;
 660 }
 661
 662 static inline void
 663 perf_cgroup_mark_enabled(struct perf_event *event,
 664                          struct perf_event_context *ctx)
 665 {
 666         struct perf_event *sub;
 667         u64 tstamp = perf_event_time(event);
 668
 669         if (!event->cgrp_defer_enabled)
 670                 return;
 671
 672         event->cgrp_defer_enabled = 0;
 673
 674         event->tstamp_enabled = tstamp - event->total_time_enabled;
 675         list_for_each_entry(sub, &event->sibling_list, group_entry) {
 676                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 677                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 678                         sub->cgrp_defer_enabled = 0;
 679                 }
 680         }
 681 }
 682 #else /* !CONFIG_CGROUP_PERF */
 683
 684 static inline bool
 685 perf_cgroup_match(struct perf_event *event)
 686 {
 687         return true;
 688 }
 689
 690 static inline void perf_detach_cgroup(struct perf_event *event)
 691 {}
 692
 693 static inline int is_cgroup_event(struct perf_event *event)
 694 {
 695         return 0;
 696 }
 697
 698 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 699 {
 700         return 0;
 701 }
 702
 703 static inline void update_cgrp_time_from_event(struct perf_event *event)
 704 {
 705 }
 706
 707 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 708 {
 709 }
 710
 711 static inline void perf_cgroup_sched_out(struct task_struct *task,
 712                                          struct task_struct *next)
 713 {
 714 }
 715
 716 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 717                                         struct task_struct *task)
 718 {
 719 }
 720
 721 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 722                                       struct perf_event_attr *attr,
 723                                       struct perf_event *group_leader)
 724 {
 725         return -EINVAL;
 726 }
 727
 728 static inline void
 729 perf_cgroup_set_timestamp(struct task_struct *task,
 730                           struct perf_event_context *ctx)
 731 {
 732 }
 733
 734 void
 735 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 736 {
 737 }
 738
 739 static inline void
 740 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 741 {
 742 }
 743
 744 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 745 {
 746         return 0;
 747 }
 748
 749 static inline void
 750 perf_cgroup_defer_enabled(struct perf_event *event)
 751 {
 752 }
 753
 754 static inline void
 755 perf_cgroup_mark_enabled(struct perf_event *event,
 756                          struct perf_event_context *ctx)
 757 {
 758 }
 759 #endif
 760
 761 /*
 762  * set default to be dependent on timer tick just
 763  * like original code
 764  */
 765 #define PERF_CPU_HRTIMER (1000 / HZ)
 766 /*
 767  * function must be called with interrupts disbled
 768  */
 769 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
 770 {
 771         struct perf_cpu_context *cpuctx;
 772         enum hrtimer_restart ret = HRTIMER_NORESTART;
 773         int rotations = 0;
 774
 775         WARN_ON(!irqs_disabled());
 776
 777         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 778
 779         rotations = perf_rotate_context(cpuctx);
 780
 781         /*
 782          * arm timer if needed
 783          */
 784         if (rotations) {
 785                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 786                 ret = HRTIMER_RESTART;
 787         }
 788
 789         return ret;
 790 }
 791
 792 /* CPU is going down */
 793 void perf_cpu_hrtimer_cancel(int cpu)
 794 {
 795         struct perf_cpu_context *cpuctx;
 796         struct pmu *pmu;
 797         unsigned long flags;
 798
 799         if (WARN_ON(cpu != smp_processor_id()))
 800                 return;
 801
 802         local_irq_save(flags);
 803
 804         rcu_read_lock();
 805
 806         list_for_each_entry_rcu(pmu, &pmus, entry) {
 807                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 808
 809                 if (pmu->task_ctx_nr == perf_sw_context)
 810                         continue;
 811
 812                 hrtimer_cancel(&cpuctx->hrtimer);
 813         }
 814
 815         rcu_read_unlock();
 816
 817         local_irq_restore(flags);
 818 }
 819
 820 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 821 {
 822         struct hrtimer *hr = &cpuctx->hrtimer;
 823         struct pmu *pmu = cpuctx->ctx.pmu;
 824         int timer;
 825
 826         /* no multiplexing needed for SW PMU */
 827         if (pmu->task_ctx_nr == perf_sw_context)
 828                 return;
 829
 830         /*
 831          * check default is sane, if not set then force to
 832          * default interval (1/tick)
 833          */
 834         timer = pmu->hrtimer_interval_ms;
 835         if (timer < 1)
 836                 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 837
 838         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 839
 840         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 841         hr->function = perf_cpu_hrtimer_handler;
 842 }
 843
 844 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
 845 {
 846         struct hrtimer *hr = &cpuctx->hrtimer;
 847         struct pmu *pmu = cpuctx->ctx.pmu;
 848
 849         /* not for SW PMU */
 850         if (pmu->task_ctx_nr == perf_sw_context)
 851                 return;
 852
 853         if (hrtimer_active(hr))
 854                 return;
 855
 856         if (!hrtimer_callback_running(hr))
 857                 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
 858                                          0, HRTIMER_MODE_REL_PINNED, 0);
 859 }
 860
 861 void perf_pmu_disable(struct pmu *pmu)
 862 {
 863         int *count = this_cpu_ptr(pmu->pmu_disable_count);
 864         if (!(*count)++)
 865                 pmu->pmu_disable(pmu);
 866 }
 867
 868 void perf_pmu_enable(struct pmu *pmu)
 869 {
 870         int *count = this_cpu_ptr(pmu->pmu_disable_count);
 871         if (!--(*count))
 872                 pmu->pmu_enable(pmu);
 873 }
 874
 875 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 876
 877 /*
 878  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 879  * perf_event_task_tick() are fully serialized because they're strictly cpu
 880  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 881  * disabled, while perf_event_task_tick is called from IRQ context.
 882  */
 883 static void perf_event_ctx_activate(struct perf_event_context *ctx)
 884 {
 885         struct list_head *head = this_cpu_ptr(&active_ctx_list);
 886
 887         WARN_ON(!irqs_disabled());
 888
 889         WARN_ON(!list_empty(&ctx->active_ctx_list));
 890
 891         list_add(&ctx->active_ctx_list, head);
 892 }
 893
 894 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 895 {
 896         WARN_ON(!irqs_disabled());
 897
 898         WARN_ON(list_empty(&ctx->active_ctx_list));
 899
 900         list_del_init(&ctx->active_ctx_list);
 901 }
 902
 903 static void get_ctx(struct perf_event_context *ctx)
 904 {
 905         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 906 }
 907
 908 static void free_ctx(struct rcu_head *head)
 909 {
 910         struct perf_event_context *ctx;
 911
 912         ctx = container_of(head, struct perf_event_context, rcu_head);
 913         kfree(ctx->task_ctx_data);
 914         kfree(ctx);
 915 }
 916
 917 static void put_ctx(struct perf_event_context *ctx)
 918 {
 919         if (atomic_dec_and_test(&ctx->refcount)) {
 920                 if (ctx->parent_ctx)
 921                         put_ctx(ctx->parent_ctx);
 922                 if (ctx->task)
 923                         put_task_struct(ctx->task);
 924                 call_rcu(&ctx->rcu_head, free_ctx);
 925         }
 926 }
 927
 928 /*
 929  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 930  * perf_pmu_migrate_context() we need some magic.
 931  *
 932  * Those places that change perf_event::ctx will hold both
 933  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 934  *
 935  * Lock ordering is by mutex address. There is one other site where
 936  * perf_event_context::mutex nests and that is put_event(). But remember that
 937  * that is a parent<->child context relation, and migration does not affect
 938  * children, therefore these two orderings should not interact.
 939  *
 940  * The change in perf_event::ctx does not affect children (as claimed above)
 941  * because the sys_perf_event_open() case will install a new event and break
 942  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 943  * concerned with cpuctx and that doesn't have children.
 944  *
 945  * The places that change perf_event::ctx will issue:
 946  *
 947  *   perf_remove_from_context();
 948  *   synchronize_rcu();
 949  *   perf_install_in_context();
 950  *
 951  * to effect the change. The remove_from_context() + synchronize_rcu() should
 952  * quiesce the event, after which we can install it in the new location. This
 953  * means that only external vectors (perf_fops, prctl) can perturb the event
 954  * while in transit. Therefore all such accessors should also acquire
 955  * perf_event_context::mutex to serialize against this.
 956  *
 957  * However; because event->ctx can change while we're waiting to acquire
 958  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 959  * function.
 960  *
 961  * Lock order:
 962  *      task_struct::perf_event_mutex
 963  *        perf_event_context::mutex
 964  *          perf_event_context::lock
 965  *          perf_event::child_mutex;
 966  *          perf_event::mmap_mutex
 967  *          mmap_sem
 968  */
 969 static struct perf_event_context *
 970 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 971 {
 972         struct perf_event_context *ctx;
 973
 974 again:
             /*
              * Sample event->ctx inside an RCU read-side section and try to take a
              * reference on it. If its refcount already reached zero, start over
              * and sample event->ctx again.
              */
 975         rcu_read_lock();
 976         ctx = ACCESS_ONCE(event->ctx);
 977         if (!atomic_inc_not_zero(&ctx->refcount)) {
 978                 rcu_read_unlock();
 979                 goto again;
 980         }
 981         rcu_read_unlock();
 982
             /*
              * @nesting is handed through to mutex_lock_nested(). Once the mutex
              * is ours, re-check event->ctx: if it changed in the meantime, drop
              * both the mutex and our reference and retry with the new value.
              */
 983         mutex_lock_nested(&ctx->mutex, nesting);
 984         if (event->ctx != ctx) {
 985                 mutex_unlock(&ctx->mutex);
 986                 put_ctx(ctx);
 987                 goto again;
 988         }
 989
             /* Success: ctx->mutex is held and we own one reference on ctx. */
 990         return ctx;
 991 }
 992
 993 static inline struct perf_event_context *
 994 perf_event_ctx_lock(struct perf_event *event)
 995 {
             /* Default flavour of the ctx lock helper: nesting level zero. */
             const int top_level = 0;

 996         return perf_event_ctx_lock_nested(event, top_level);
 997 }
 998
 999 static void perf_event_ctx_unlock(struct perf_event *event,
1000                                   struct perf_event_context *ctx)
1001 {
             /*
              * Release ctx->mutex and then drop one reference on @ctx, i.e. undo
              * what perf_event_ctx_lock_nested() returned with. @event itself is
              * not used by this function.
              */
1002         mutex_unlock(&ctx->mutex);
1003         put_ctx(ctx);
1004 }
1005
1006 /*
1007  * This must be done under the ctx->lock, such as to serialize against
1008  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1009  * calling scheduler related locks and ctx->lock nests inside those.
      *
      * Detaches @ctx from its parent context, if it has one, and bumps
      * ctx->generation in either case. The former parent (or NULL) is returned
      * rather than put here; presumably the caller drops that reference once
      * ctx->lock has been released -- confirm against the callers.
1010  */
1011 static __must_check struct perf_event_context *
1012 unclone_ctx(struct perf_event_context *ctx)
1013 {
1014         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1015
1016         lockdep_assert_held(&ctx->lock);
1017
1018         if (parent_ctx)
1019                 ctx->parent_ctx = NULL;
1020         ctx->generation++;
1021
1022         return parent_ctx;
1023 }
1024
1025 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1026 {
1027         /*
1028          * The pid namespace an event was created in is only recorded on the
1029          * top level event, so an event with a parent defers to that parent.
1030          */
1031         struct perf_event *top = event->parent ? event->parent : event;
1032
1033         return task_tgid_nr_ns(p, top->ns);
1034 }
1035
1036 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1037 {
1038         /*
1039          * Events with a parent do not carry a pid namespace of their own;
1040          * use the one recorded on the top level event instead.
1041          */
1042         struct perf_event *top = event->parent ? event->parent : event;
1043
1044         return task_pid_nr_ns(p, top->ns);
1045 }
1046
1047 /*
1048  * Report the id userspace should see for @event: for an inherited event
1049  * this is the id of its parent, otherwise the event's own id.
1050  */
1051 static u64 primary_event_id(struct perf_event *event)
1052 {
1053         struct perf_event *origin = event->parent;
1054
1055         if (origin)
1056                 return origin->id;
1057
1058         return event->id;
1059 }
1060
1061 /*
1062  * Get the perf_event_context for a task and lock it.
1063  * This has to cope with the fact that until it is locked,
1064  * the context could get moved to another task.
      *
      * On success the context is returned with ctx->lock held (the saved IRQ
      * state is stored in *flags) and with its refcount incremented. NULL is
      * returned, with no lock held, if the task has no context in slot @ctxn
      * or if the context's refcount had already dropped to zero.
1065  */
1066 static struct perf_event_context *
1067 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1068 {
1069         struct perf_event_context *ctx;
1070
1071 retry:
1072         /*
1073          * One of the few rules of preemptible RCU is that one cannot do
1074          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1075          * part of the read side critical section was preemptible -- see
1076          * rcu_read_unlock_special().
1077          *
1078          * Since ctx->lock nests under rq->lock we must ensure the entire read
1079          * side critical section is non-preemptible.
1080          */
1081         preempt_disable();
1082         rcu_read_lock();
1083         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1084         if (ctx) {
1085                 /*
1086                  * If this context is a clone of another, it might
1087                  * get swapped for another underneath us by
1088                  * perf_event_task_sched_out, though the
1089                  * rcu_read_lock() protects us from any context
1090                  * getting freed.  Lock the context and check if it
1091                  * got swapped before we could get the lock, and retry
1092                  * if so.  If we locked the right context, then it
1093                  * can't get swapped on us any more.
1094                  */
1095                 raw_spin_lock_irqsave(&ctx->lock, *flags);
1096                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1097                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1098                         rcu_read_unlock();
1099                         preempt_enable();
1100                         goto retry;
1101                 }
1102
                     /*
                      * A refcount of zero means the last reference is already
                      * gone; report the context as absent and do not keep the
                      * lock.
                      */
1103                 if (!atomic_inc_not_zero(&ctx->refcount)) {
1104                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1105                         ctx = NULL;
1106                 }
1107         }
1108         rcu_read_unlock();
1109         preempt_enable();
1110         return ctx;
1111 }
1112
1113 /*
1114  * Get the context for a task and increment its pin_count so it
1115  * can't get swapped to another task.  This also increments its
1116  * reference count so that the context can't get freed.
      *
      * Returns the pinned context, or NULL if perf_lock_task_context() did not
      * hand one back. ctx->lock is released again before returning.
1117  */
1118 static struct perf_event_context *
1119 perf_pin_task_context(struct task_struct *task, int ctxn)
1120 {
1121         struct perf_event_context *ctx;
1122         unsigned long flags;
1123
             /* On success this returns with ctx->lock held and a reference taken. */
1124         ctx = perf_lock_task_context(task, ctxn, &flags);
1125         if (ctx) {
1126                 ++ctx->pin_count;
1127                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1128         }
1129         return ctx;
1130 }
1131
1132 static void perf_unpin_context(struct perf_event_context *ctx)
1133 {
1134         unsigned long flags;
1135
             /*
              * Decrement pin_count under ctx->lock, mirroring the increment in
              * perf_pin_task_context(). The context reference is not dropped here.
              */
1136         raw_spin_lock_irqsave(&ctx->lock, flags);
1137         --ctx->pin_count;
1138         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1139 }
1140
1141 /*
1142  * Advance ctx->time by the perf_clock() time that has passed since
      * ctx->timestamp, then move ctx->timestamp up to the current reading.
1143  */
1144 static void update_context_time(struct perf_event_context *ctx)
1145 {
1146         u64 stamp = perf_clock();
             u64 elapsed = stamp - ctx->timestamp;
1147
1148         ctx->time += elapsed;
1149         ctx->timestamp = stamp;
1150 }
1151
1152 static u64 perf_event_time(struct perf_event *event)
1153 {
             /*
              * Current time for @event: cgroup events use whatever
              * perf_cgroup_event_time() reports, all other events use the time of
              * their context (0 if they have no context).
              */
1154         struct perf_event_context *owner = event->ctx;
1155
1156         if (is_cgroup_event(event))
1157                 return perf_cgroup_event_time(event);
1158
             if (!owner)
                     return 0;

1159         return owner->time;
1160 }
1161
1162 /*
1163  * Recompute total_time_enabled and total_time_running for an event.
1164  * The caller of this function needs to hold the ctx->lock.
1165  */
1166 static void update_event_times(struct perf_event *event)
1167 {
1168         struct perf_event_context *ctx = event->ctx;
1169         u64 enabled_end, running_end;
1170
             /*
              * Nothing to account while the event, or its group leader, is in a
              * state below PERF_EVENT_STATE_INACTIVE.
              */
1171         if (event->state < PERF_EVENT_STATE_INACTIVE)
                     return;
1172         if (event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1173                 return;
1174
1175         /*
1176          * For cgroup events, time_enabled is the time the event was enabled
1177          * AND active tasks were in the monitored cgroup. That does not depend
1178          * on the activity of the context, which may hold a mix of cgroup and
1179          * non-cgroup events, so cgroup events get their own time source.
1180          *
1181          * All other events use the context time while the context is active
1182          * and the time they were last stopped otherwise.
1183          */
1184         if (is_cgroup_event(event))
1185                 enabled_end = perf_cgroup_event_time(event);
1186         else
1187                 enabled_end = ctx->is_active ? ctx->time
                                                  : event->tstamp_stopped;
1190
1191         event->total_time_enabled = enabled_end - event->tstamp_enabled;
1192
             /* An inactive event stopped running at tstamp_stopped. */
1193         if (event->state == PERF_EVENT_STATE_INACTIVE)
1194                 running_end = event->tstamp_stopped;
1195         else
1196                 running_end = perf_event_time(event);
1197
1198         event->total_time_running = running_end - event->tstamp_running;
1200 }
1201
1202 /*
1203  * Refresh the enabled/running totals of a whole group: the leader first,
      * then every event on the leader's sibling_list.
1204  */
1205 static void update_group_times(struct perf_event *leader)
1206 {
1207         struct perf_event *member;
1208
1209         update_event_times(leader);
1210         list_for_each_entry(member, &leader->sibling_list, group_entry)
1211                 update_event_times(member);
1212 }
1213
1214 static struct list_head *
1215 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1216 {
             /* Pinned events live on pinned_groups, all others on flexible_groups. */
1217         return event->attr.pinned ? &ctx->pinned_groups
                                       : &ctx->flexible_groups;
1221 }
1222
1223 /*
1224  * Add an event to the lists for its context.
1225  * Must be called with ctx->mutex and ctx->lock held.
1226  */
1227 static void
1228 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1229 {
             /* Mark the event as attached to a context; warn on a double attach. */
1230         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1231         event->attach_state |= PERF_ATTACH_CONTEXT;
1232
1233         /*
1234          * If we're a stand alone event or group leader, we go to the context
1235          * list, group events are kept attached to the group so that
1236          * perf_group_detach can, at all times, locate all siblings.
1237          */
1238         if (event->group_leader == event) {
1239                 struct list_head *list;
1240
1241                 if (is_software_event(event))
1242                         event->group_flags |= PERF_GROUP_SOFTWARE;
1243
                     /* pinned_groups or flexible_groups, depending on attr.pinned. */
1244                 list = ctx_group_list(event, ctx);
1245                 list_add_tail(&event->group_entry, list);
1246         }
1247
1248         if (is_cgroup_event(event))
1249                 ctx->nr_cgroups++;
1250
             /* Every event, leader or not, is also linked on ctx->event_list. */
1251         list_add_rcu(&event->event_entry, &ctx->event_list);
1252         ctx->nr_events++;
1253         if (event->attr.inherit_stat)
1254                 ctx->nr_stat++;
1255
             /* The set of events in the context changed. */
1256         ctx->generation++;
1257 }
1258
1259 /*
1260  * Set the initial event state from perf_event_attr::disabled: OFF when the
      * attribute is set, INACTIVE otherwise.
1261  */
1262 static inline void perf_event__state_init(struct perf_event *event)
1263 {
1264         if (event->attr.disabled)
1265                 event->state = PERF_EVENT_STATE_OFF;
             else
                     event->state = PERF_EVENT_STATE_INACTIVE;
1266 }
1267
1268 /*
1269  * Recompute event->read_size from attr.read_format. Run when a perf_event
1270  * is created and whenever events are attached to or detached from a group.
1271  */
1272 static void perf_event__read_size(struct perf_event *event)
1273 {
1274         u64 fmt = event->attr.read_format;
1275         int per_event = sizeof(u64);    /* the value; counted once per event */
1276         int fixed = 0;                  /* bytes counted only once */
             int count = 1;                  /* this event, plus siblings if grouped */
1277
1278         if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
1279                 fixed += sizeof(u64);
1280
1281         if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
1282                 fixed += sizeof(u64);
1283
1284         if (fmt & PERF_FORMAT_ID)
1285                 per_event += sizeof(u64);
1286
1287         if (fmt & PERF_FORMAT_GROUP) {
1288                 count += event->group_leader->nr_siblings;
1289                 fixed += sizeof(u64);
1290         }
1291
1292         event->read_size = fixed + per_event * count;
1294 }
1295
1296 static void perf_event__header_size(struct perf_event *event)
1297 {
             /*
              * Recompute event->header_size: the summed sizes of the sample fields
              * handled below that attr.sample_type selects.
              *
              * NOTE(review): the accumulator is a u16 while read_size scales with
              * the number of siblings -- confirm the group size is bounded
              * elsewhere so that this cannot wrap.
              */
1298         struct perf_sample_data *sample;        /* sizeof() operand only */
1299         u64 wanted = event->attr.sample_type;
1300         u16 bytes = 0;
1301
             /* read_size feeds the PERF_SAMPLE_READ term, so refresh it first. */
1302         perf_event__read_size(event);
1303
1304         bytes += (wanted & PERF_SAMPLE_IP) ? sizeof(sample->ip) : 0;
1307         bytes += (wanted & PERF_SAMPLE_ADDR) ? sizeof(sample->addr) : 0;
1310         bytes += (wanted & PERF_SAMPLE_PERIOD) ? sizeof(sample->period) : 0;
1313         bytes += (wanted & PERF_SAMPLE_WEIGHT) ? sizeof(sample->weight) : 0;
1316         bytes += (wanted & PERF_SAMPLE_READ) ? event->read_size : 0;
1319         bytes += (wanted & PERF_SAMPLE_DATA_SRC) ?
                             sizeof(sample->data_src.val) : 0;
1322         bytes += (wanted & PERF_SAMPLE_TRANSACTION) ? sizeof(sample->txn) : 0;
1324
1325         event->header_size = bytes;
1326 }
1327
1328 static void perf_event__id_header_size(struct perf_event *event)
1329 {
             /*
              * Recompute event->id_header_size: the summed sizes of the
              * identification fields (tid, time, id, stream id, cpu) that
              * attr.sample_type selects.
              */
1330         struct perf_sample_data *sample;        /* sizeof() operand only */
1331         u64 wanted = event->attr.sample_type;
1332         u16 bytes = 0;
1333
1334         bytes += (wanted & PERF_SAMPLE_TID) ? sizeof(sample->tid_entry) : 0;
1337         bytes += (wanted & PERF_SAMPLE_TIME) ? sizeof(sample->time) : 0;
             /* IDENTIFIER and ID are counted separately; both are sizeof(id). */
1340         bytes += (wanted & PERF_SAMPLE_IDENTIFIER) ? sizeof(sample->id) : 0;
1343         bytes += (wanted & PERF_SAMPLE_ID) ? sizeof(sample->id) : 0;
1346         bytes += (wanted & PERF_SAMPLE_STREAM_ID) ? sizeof(sample->stream_id) : 0;
1349         bytes += (wanted & PERF_SAMPLE_CPU) ? sizeof(sample->cpu_entry) : 0;
1351
1352         event->id_header_size = bytes;
1353 }
1354
1355 static void perf_group_attach(struct perf_event *event)
1356 {
             /*
              * Link @event into the sibling list of its group leader and refresh
              * the header sizes of the whole group. For a leader (or stand alone
              * event) only the PERF_ATTACH_GROUP flag is set.
              */
1357         struct perf_event *group_leader = event->group_leader, *pos;
1358
1359         /*
1360          * We can have double attach due to group movement in perf_event_open.
1361          */
1362         if (event->attach_state & PERF_ATTACH_GROUP)
1363                 return;
1364
1365         event->attach_state |= PERF_ATTACH_GROUP;
1366
             /* A leader has no sibling list to join. */
1367         if (group_leader == event)
1368                 return;
1369
1370         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1371
             /*
              * Adding a non-software event clears PERF_GROUP_SOFTWARE on the
              * leader.
              */
1372         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1373                         !is_software_event(event))
1374                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1375
1376         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1377         group_leader->nr_siblings++;
1378
             /* nr_siblings changed: recompute the sizes of every group member. */
1379         perf_event__header_size(group_leader);
1380
1381         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1382                 perf_event__header_size(pos);
1383 }
1384
1385 /*
1386  * Remove an event from the lists for its context.
1387  * Must be called with ctx->mutex and ctx->lock held.
1388  */
1389 static void
1390 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1391 {
1392         struct perf_cpu_context *cpuctx;
1393
1394         WARN_ON_ONCE(event->ctx != ctx);
1395         lockdep_assert_held(&ctx->lock);
1396
1397         /*
1398          * We can have double detach due to exit/hot-unplug + close.
1399          */
1400         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1401                 return;
1402
1403         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1404
1405         if (is_cgroup_event(event)) {
1406                 ctx->nr_cgroups--;
1407                 cpuctx = __get_cpu_context(ctx);
1408                 /*
1409                  * if there are no more cgroup events
1410                  * then clear cgrp to avoid stale pointer
1411                  * in update_cgrp_time_from_cpuctx()
1412                  */
1413                 if (!ctx->nr_cgroups)
1414                         cpuctx->cgrp = NULL;
1415         }
1416
             /* Undo the counters that list_add_event() incremented. */
1417         ctx->nr_events--;
1418         if (event->attr.inherit_stat)
1419                 ctx->nr_stat--;
1420
1421         list_del_rcu(&event->event_entry);
1422
             /* Only a leader (or stand alone event) sits on a context group list. */
1423         if (event->group_leader == event)
1424                 list_del_init(&event->group_entry);
1425
             /* Bring the times up to date before the state is changed below. */
1426         update_group_times(event);
1427
1428         /*
1429          * If event was in error state, then keep it
1430          * that way, otherwise bogus counts will be
1431          * returned on read(). The only way to get out
1432          * of error state is by explicit re-enabling
1433          * of the event
1434          */
1435         if (event->state > PERF_EVENT_STATE_OFF)
1436                 event->state = PERF_EVENT_STATE_OFF;
1437
             /* The set of events in the context changed. */
1438         ctx->generation++;
1439 }
1440
1441 static void perf_group_detach(struct perf_event *event)
1442 {
             /*
              * Take @event out of its group. A sibling is simply unlinked from its
              * leader; when a leader is detached, each of its siblings becomes its
              * own leader. Header sizes are recomputed afterwards.
              */
1443         struct perf_event *sibling, *tmp;
1444         struct list_head *list = NULL;
1445
1446         /*
1447          * We can have double detach due to exit/hot-unplug + close.
1448          */
1449         if (!(event->attach_state & PERF_ATTACH_GROUP))
1450                 return;
1451
1452         event->attach_state &= ~PERF_ATTACH_GROUP;
1453
1454         /*
1455          * If this is a sibling, remove it from its group.
1456          */
1457         if (event->group_leader != event) {
1458                 list_del_init(&event->group_entry);
1459                 event->group_leader->nr_siblings--;
1460                 goto out;
1461         }
1462
             /*
              * If the leader is still linked on a list, remember its position so
              * that the siblings can be moved there; otherwise list stays NULL
              * and the siblings are not moved anywhere.
              */
1463         if (!list_empty(&event->group_entry))
1464                 list = &event->group_entry;
1465
1466         /*
1467          * If this was a group event with sibling events then
1468          * upgrade the siblings to singleton events by adding them
1469          * to whatever list we are on.
1470          */
1471         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1472                 if (list)
1473                         list_move_tail(&sibling->group_entry, list);
1474                 sibling->group_leader = sibling;
1475
1476                 /* Inherit group flags from the previous leader */
1477                 sibling->group_flags = event->group_flags;
1478
1479                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1480         }
1481
1482 out:
             /*
              * Recompute the sizes for event->group_leader and whatever is on its
              * sibling list at this point.
              */
1483         perf_event__header_size(event->group_leader);
1484
1485         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1486                 perf_event__header_size(tmp);
1487 }
1488
1489 /*
1490  * A user event whose owning task is gone: the event exists, is not a
      * kernel event, and has no owner.
1491  */
1492 static bool is_orphaned_event(struct perf_event *event)
1493 {
             if (!event)
                     return false;
             if (is_kernel_event(event))
                     return false;

1494         return !event->owner;
1495 }
1496
1497 /*
1498  * True for an event whose parent is orphaned, i.e. the parent's task has
1499  * finished and only the references held by children keep the parent alive.
1500  */
1501 static bool is_orphaned_child(struct perf_event *event)
1502 {
             struct perf_event *parent = event->parent;

1503         return is_orphaned_event(parent);
1504 }
1505
1506 static void orphans_remove_work(struct work_struct *work);
1507
1508 static void schedule_orphans_remove(struct perf_event_context *ctx)
1509 {
             /*
              * Queue ctx->orphans_remove on perf_wq with a delay argument of 1.
              * Nothing is done for contexts without a task, when the work has
              * already been scheduled, or when perf_wq does not exist.
              */
1510         if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1511                 return;
1512
             /*
              * If the work was newly queued, take a context reference on its
              * behalf and remember that it is pending. Presumably the work
              * function drops that reference again -- confirm in
              * orphans_remove_work().
              */
1513         if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1514                 get_ctx(ctx);
1515                 ctx->orphans_remove_sched = true;
1516         }
1517 }
1518
1519 static int __init perf_workqueue_init(void)
1520 {
             /*
              * Create the single threaded "perf" workqueue used by
              * schedule_orphans_remove(). A failure is reported with a warning
              * and leaves perf_wq NULL.
              *
              * Returns 0 on success or -ENOMEM if the workqueue could not be
              * created, rather than a bare -1 that would read as -EPERM.
              */
1521         perf_wq = create_singlethread_workqueue("perf");
1522         WARN(!perf_wq, "failed to create perf workqueue\n");
1523         return perf_wq ? 0 : -ENOMEM;
1524 }
1525
1526 core_initcall(perf_workqueue_init);
1527
1528 static inline int
1529 event_filter_match(struct perf_event *event)
1530 {
             /*
              * An event matches when it is not bound to a cpu (cpu == -1) or is
              * bound to the cpu we are running on, and perf_cgroup_match() agrees.
              */
1531         if (event->cpu != -1 && event->cpu != smp_processor_id())
                     return 0;

1532         return perf_cgroup_match(event) != 0;
1533 }
1534
1535 static void
1536 event_sched_out(struct perf_event *event,
1537                   struct perf_cpu_context *cpuctx,
1538                   struct perf_event_context *ctx)
1539 {
             /*
              * Remove @event from its PMU if it is currently active and update the
              * bookkeeping in @cpuctx and @ctx. Must be called with ctx->lock
              * held.
              */
1540         u64 tstamp = perf_event_time(event);
1541         u64 delta;
1542
1543         WARN_ON_ONCE(event->ctx != ctx);
1544         lockdep_assert_held(&ctx->lock);
1545
1546         /*
1547          * An event which could not be activated because of
1548          * filter mismatch still needs to have its timings
1549          * maintained, otherwise bogus information is returned
1550          * via read() for time_enabled, time_running:
1551          */
1552         if (event->state == PERF_EVENT_STATE_INACTIVE
1553             && !event_filter_match(event)) {
1554                 delta = tstamp - event->tstamp_stopped;
1555                 event->tstamp_running += delta;
1556                 event->tstamp_stopped = tstamp;
1557         }
1558
             /* Only an active event has anything to tear down. */
1559         if (event->state != PERF_EVENT_STATE_ACTIVE)
1560                 return;
1561
1562         perf_pmu_disable(event->pmu);
1563
             /*
              * The event becomes INACTIVE, or OFF when a disable was pending; the
              * pending flag is consumed here.
              */
1564         event->state = PERF_EVENT_STATE_INACTIVE;
1565         if (event->pending_disable) {
1566                 event->pending_disable = 0;
1567                 event->state = PERF_EVENT_STATE_OFF;
1568         }
1569         event->tstamp_stopped = tstamp;
1570         event->pmu->del(event, 0);
1571         event->oncpu = -1;
1572
             /*
              * Adjust the active-event accounting. When the last active event of
              * the context goes away, the context leaves the active list.
              */
1573         if (!is_software_event(event))
1574                 cpuctx->active_oncpu--;
1575         if (!--ctx->nr_active)
1576                 perf_event_ctx_deactivate(ctx);
1577         if (event->attr.freq && event->attr.sample_freq)
1578                 ctx->nr_freq--;
1579         if (event->attr.exclusive || !cpuctx->active_oncpu)
1580                 cpuctx->exclusive = 0;
1581
1582         if (is_orphaned_child(event))
1583                 schedule_orphans_remove(ctx);
1584
1585         perf_pmu_enable(event->pmu);
1586 }
1587
1588 static void
1589 group_sched_out(struct perf_event *group_event,
1590                 struct perf_cpu_context *cpuctx,
1591                 struct perf_event_context *ctx)
1592 {
             /* Schedule out a whole group: the leader first, then its siblings. */
1593         struct perf_event *event;
             /* Leader state sampled before event_sched_out() may change it. */
1594         int state = group_event->state;
1595
1596         event_sched_out(group_event, cpuctx, ctx);
1597
1598         /*
1599          * Schedule out siblings (if any):
1600          */
1601         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1602                 event_sched_out(event, cpuctx, ctx);
1603
             /* An exclusive group that was active gives up the cpu's exclusive flag. */
1604         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1605                 cpuctx->exclusive = 0;
1606 }
1607
1608 struct remove_event {
             /* Argument bundle passed to __perf_remove_from_context() as @info. */
1609         struct perf_event *event;       /* the event to remove */
1610         bool detach_group;              /* also run perf_group_detach() on it */
1611 };
1612
1613 /*
1614  * Cross CPU call to remove a performance event
1615  *
1616  * We disable the event on the hardware level first. After that we
1617  * remove it from the context list.
      *
      * @info points to a struct remove_event. Always returns 0.
1618  */
1619 static int __perf_remove_from_context(void *info)
1620 {
1621         struct remove_event *re = info;
1622         struct perf_event *event = re->event;
1623         struct perf_event_context *ctx = event->ctx;
1624         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1625
1626         raw_spin_lock(&ctx->lock);
1627         event_sched_out(event, cpuctx, ctx);
1628         if (re->detach_group)
1629                 perf_group_detach(event);
1630         list_del_event(event, ctx);
             /*
              * If that was the last event and @ctx is this CPU's current task
              * context, mark it inactive and detach it from the CPU context.
              */
1631         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1632                 ctx->is_active = 0;
1633                 cpuctx->task_ctx = NULL;
1634         }
1635         raw_spin_unlock(&ctx->lock);
1636
1637         return 0;
1638 }
1639
1640
1641 /*
1642  * Remove the event from a task's (or a CPU's) list of events.
1643  *
1644  * CPU events are removed with a smp call. For task events we only
1645  * call when the task is on a CPU.
1646  *
1647  * If event->ctx is a cloned context, callers must make sure that
1648  * every task struct that event->ctx->task could possibly point to
1649  * remains valid.  This is OK when called from perf_release since
1650  * that only calls us on the top-level context, which can't be a clone.
1651  * When called from perf_event_exit_task, it's OK because the
1652  * context has been detached from its task.
      *
      * The caller must hold ctx->mutex. If @detach_group is true the event is
      * also taken out of its group.
1653  */
1654 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1655 {
1656         struct perf_event_context *ctx = event->ctx;
1657         struct task_struct *task = ctx->task;
1658         struct remove_event re = {
1659                 .event = event,
1660                 .detach_group = detach_group,
1661         };
1662
1663         lockdep_assert_held(&ctx->mutex);
1664
1665         if (!task) {
1666                 /*
1667                  * Per cpu events are removed via an smp call. The removal can
1668                  * fail if the CPU is currently offline, but in that case we
1669                  * already called __perf_remove_from_context from
1670                  * perf_event_exit_cpu.
1671                  */
1672                 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1673                 return;
1674         }
1675
1676 retry:
             /* A zero return means the removal was done by the cross call. */
1677         if (!task_function_call(task, __perf_remove_from_context, &re))
1678                 return;
1679
1680         raw_spin_lock_irq(&ctx->lock);
1681         /*
1682          * If we failed to find a running task, but find the context active now
1683          * that we've acquired the ctx->lock, retry.
1684          */
1685         if (ctx->is_active) {
1686                 raw_spin_unlock_irq(&ctx->lock);
1687                 /*
1688                  * Reload the task pointer, it might have been changed by
1689                  * a concurrent perf_event_context_sched_out().
1690                  */
1691                 task = ctx->task;
1692                 goto retry;
1693         }
1694
1695         /*
1696          * Since the task isn't running, it's safe to remove the event, us
1697          * holding the ctx->lock ensures the task won't get scheduled in.
1698          */
1699         if (detach_group)
1700                 perf_group_detach(event);
1701         list_del_event(event, ctx);
1702         raw_spin_unlock_irq(&ctx->lock);
1703 }
1704
1705 /*
1706  * Cross CPU call to disable a performance event
      *
      * @info is the struct perf_event to disable. Returns -EINVAL when the event
      * belongs to a task context that is not the current task context of this
      * CPU, and 0 otherwise.
1707  */
1708 int __perf_event_disable(void *info)
1709 {
1710         struct perf_event *event = info;
1711         struct perf_event_context *ctx = event->ctx;
1712         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1713
1714         /*
1715          * If this is a per-task event, need to check whether this
1716          * event's task is the current task on this cpu.
1717          *
1718          * Can trigger due to concurrent perf_event_context_sched_out()
1719          * flipping contexts around.
1720          */
1721         if (ctx->task && cpuctx->task_ctx != ctx)
1722                 return -EINVAL;
1723
1724         raw_spin_lock(&ctx->lock);
1725
1726         /*
1727          * If the event is on, turn it off.
1728          * If it is in error state, leave it in error state.
1729          */
1730         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
                     /* Bring all time accounting up to date before stopping. */
1731                 update_context_time(ctx);
1732                 update_cgrp_time_from_event(event);
1733                 update_group_times(event);
                     /* Disabling a leader schedules out its whole group. */
1734                 if (event == event->group_leader)
1735                         group_sched_out(event, cpuctx, ctx);
1736                 else
1737                         event_sched_out(event, cpuctx, ctx);
1738                 event->state = PERF_EVENT_STATE_OFF;
1739         }
1740
1741         raw_spin_unlock(&ctx->lock);
1742
1743         return 0;
1744 }
1745
1746 /*
1747  * Disable an event.
1748  *
1749  * If event->ctx is a cloned context, callers must make sure that
1750  * every task struct that event->ctx->task could possibly point to
1751  * remains valid.  This condition is satisfied when called through
1752  * perf_event_for_each_child or perf_event_for_each because they
1753  * hold the top-level event's child_mutex, so any descendant that
1754  * goes to exit will block in sync_child_event.
1755  * When called from perf_pending_event it's OK because event->ctx
1756  * is the current context on this CPU and preemption is disabled,
1757  * hence we can't get into perf_event_task_sched_out for this context.
1758  */
1759 static void _perf_event_disable(struct perf_event *event)
1760 {
1761         struct perf_event_context *ctx = event->ctx;
1762         struct task_struct *task = ctx->task;
1763
1764         if (!task) {
1765                 /*
1766                  * Disable the event on the cpu that it's on
1767                  */
1768                 cpu_function_call(event->cpu, __perf_event_disable, event);
1769                 return;
1770         }
1771
1772 retry:
             /* A zero return means the event was disabled by the cross call. */
1773         if (!task_function_call(task, __perf_event_disable, event))
1774                 return;
1775
1776         raw_spin_lock_irq(&ctx->lock);
1777         /*
1778          * If the event is still active, we need to retry the cross-call.
1779          */
1780         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1781                 raw_spin_unlock_irq(&ctx->lock);
1782                 /*
1783                  * Reload the task pointer, it might have been changed by
1784                  * a concurrent perf_event_context_sched_out().
1785                  */
1786                 task = ctx->task;
1787                 goto retry;
1788         }
1789
1790         /*
1791          * Since we have the lock this context can't be scheduled
1792          * in, so we can change the state safely.
1793          */
1794         if (event->state == PERF_EVENT_STATE_INACTIVE) {
1795                 update_group_times(event);
1796                 event->state = PERF_EVENT_STATE_OFF;
1797         }
1798         raw_spin_unlock_irq(&ctx->lock);
1799 }
1800
1801 /*
1802  * Strictly speaking kernel users cannot create groups and therefore this
1803  * interface does not need the perf_event_ctx_lock() magic.
      *
      * Exported wrapper: runs _perf_event_disable() with the event's context
      * mutex held, taken and released via perf_event_ctx_lock()/_unlock().
1804  */
1805 void perf_event_disable(struct perf_event *event)
1806 {
1807         struct perf_event_context *ctx;
1808
1809         ctx = perf_event_ctx_lock(event);
1810         _perf_event_disable(event);
1811         perf_event_ctx_unlock(event, ctx);
1812 }
1813 EXPORT_SYMBOL_GPL(perf_event_disable);
1814
1815 static void perf_set_shadow_time(struct perf_event *event,
1816                                  struct perf_event_context *ctx,
1817                                  u64 tstamp)
1818 {
1819         /*
1820          * Take the time snapshot against the time source that matches the
1821          * event.
1822          *
1823          * A single formula could do: by the time we get here the caller has
1824          * most likely already run update_context_time() and
1825          * update_cgrp_time_xx(), so both timestamps are identical (or very
1826          * close). As tstamp is already adjusted for cgroup, we could say
1827          * that:
1828          *    tstamp - ctx->timestamp
1829          * is equivalent to
1830          *    tstamp - cgrp->timestamp.
1831          *
1832          * The calculation in perf_output_read() would then work unchanged,
1833          * because:
1834          * - the event is guaranteed to be scheduled in
1835          * - nothing is scheduled out in between
1836          * - thus the timestamp would be the same
1837          *
1838          * But this is a bit hairy.
1839          *
1840          * So cgroup events get an explicit cgroup call instead and remain
1841          * within the same time source all along. We believe it is cleaner
1842          * and simpler to understand.
1843          */
1844         if (is_cgroup_event(event)) {
1845                 perf_cgroup_set_shadow_time(event, tstamp);
                     return;
             }

1847         event->shadow_ctx_time = tstamp - ctx->timestamp;
1848 }
1849
1850 #define MAX_INTERRUPTS (~0ULL)
1851
1852 static void perf_log_throttle(struct perf_event *event, int enable);
1853
1854 static int
1855 event_sched_in(struct perf_event *event,
1856                  struct perf_cpu_context *cpuctx,
1857                  struct perf_event_context *ctx)
1858 {
             /*
              * Try to add a single @event to its PMU on the current CPU.
              * The caller must hold ctx->lock (asserted below).
              *
              * Returns 0 when the event was added, or when its state is OFF
              * or lower and nothing was done. Returns -EAGAIN when
              * pmu->add() fails; the event is then put back to INACTIVE
              * with oncpu == -1.
              */
1859         u64 tstamp = perf_event_time(event);
1860         int ret = 0;
1861
1862         lockdep_assert_held(&ctx->lock);
1863
1864         if (event->state <= PERF_EVENT_STATE_OFF)
1865                 return 0;
1866
1867         event->state = PERF_EVENT_STATE_ACTIVE;
1868         event->oncpu = smp_processor_id();
1869
1870         /*
1871          * Unthrottle events, since we scheduled we might have missed several
1872          * ticks already, also for a heavily scheduling task there is little
1873          * guarantee it'll get a tick in a timely manner.
1874          */
1875         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1876                 perf_log_throttle(event, 1);
1877                 event->hw.interrupts = 0;
1878         }
1879
1880         /*
1881          * The new state must be visible before we turn it on in the hardware:
1882          */
1883         smp_wmb();
1884
1885         perf_pmu_disable(event->pmu);
1886
             /* Advance tstamp_running by the interval since tstamp_stopped. */
1887         event->tstamp_running += tstamp - event->tstamp_stopped;
1888
1889         perf_set_shadow_time(event, ctx, tstamp);
1890
             /* A non-zero return from pmu->add() is treated as failure. */
1891         if (event->pmu->add(event, PERF_EF_START)) {
1892                 event->state = PERF_EVENT_STATE_INACTIVE;
1893                 event->oncpu = -1;
1894                 ret = -EAGAIN;
1895                 goto out;
1896         }
1897
             /*
              * Bookkeeping for a successful add: only non-software events
              * count towards cpuctx->active_oncpu, and the first active
              * event of the context triggers perf_event_ctx_activate().
              */
1898         if (!is_software_event(event))
1899                 cpuctx->active_oncpu++;
1900         if (!ctx->nr_active++)
1901                 perf_event_ctx_activate(ctx);
1902         if (event->attr.freq && event->attr.sample_freq)
1903                 ctx->nr_freq++;
1904
1905         if (event->attr.exclusive)
1906                 cpuctx->exclusive = 1;
1907
1908         if (is_orphaned_child(event))
1909                 schedule_orphans_remove(ctx);
1910
1911 out:
             /* Pairs with the perf_pmu_disable() above on both paths. */
1912         perf_pmu_enable(event->pmu);
1913
1914         return ret;
1915 }
1916
1917 static int
1918 group_sched_in(struct perf_event *group_event,
1919                struct perf_cpu_context *cpuctx,
1920                struct perf_event_context *ctx)
1921 {
             /*
              * Schedule in @group_event (the leader) and every event on its
              * sibling_list inside one PMU transaction (start_txn ...
              * commit_txn / cancel_txn).
              *
              * Returns 0 when the whole group went on, or when the leader is
              * in the OFF state and nothing was done. Returns -EAGAIN when
              * any member could not be added or commit_txn() reported
              * failure; members that were already added are scheduled out
              * again before returning.
              */
1922         struct perf_event *event, *partial_group = NULL;
1923         struct pmu *pmu = ctx->pmu;
1924         u64 now = ctx->time;
1925         bool simulate = false;
1926
1927         if (group_event->state == PERF_EVENT_STATE_OFF)
1928                 return 0;
1929
1930         pmu->start_txn(pmu);
1931
             /* The leader goes first; if it fails there is nothing to undo. */
1932         if (event_sched_in(group_event, cpuctx, ctx)) {
1933                 pmu->cancel_txn(pmu);
1934                 perf_cpu_hrtimer_restart(cpuctx);
1935                 return -EAGAIN;
1936         }
1937
1938         /*
1939          * Schedule in siblings as one group (if any):
1940          */
1941         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1942                 if (event_sched_in(event, cpuctx, ctx)) {
                             /* Remember the first sibling that failed. */
1943                         partial_group = event;
1944                         goto group_error;
1945                 }
1946         }
1947
             /* commit_txn() returning zero means the group is on. */
1948         if (!pmu->commit_txn(pmu))
1949                 return 0;
1950
1951 group_error:
1952         /*
1953          * Groups can be scheduled in as one unit only, so undo any
1954          * partial group before returning:
1955          * The events up to the failed event are scheduled out normally,
1956          * tstamp_stopped will be updated.
1957          *
1958          * The failed events and the remaining siblings need to have
1959          * their timings updated as if they had gone thru event_sched_in()
1960          * and event_sched_out(). This is required to get consistent timings
1961          * across the group. This also takes care of the case where the group
1962          * could never be scheduled by ensuring tstamp_stopped is set to mark
1963          * the time the event was actually stopped, such that time delta
1964          * calculation in update_event_times() is correct.
1965          */
1966         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
                     /*
                      * When we get here because commit_txn() failed,
                      * partial_group is still NULL, so simulate stays false
                      * and every sibling is scheduled out normally.
                      */
1967                 if (event == partial_group)
1968                         simulate = true;
1969
1970                 if (simulate) {
1971                         event->tstamp_running += now - event->tstamp_stopped;
1972                         event->tstamp_stopped = now;
1973                 } else {
1974                         event_sched_out(event, cpuctx, ctx);
1975                 }
1976         }
1977         event_sched_out(group_event, cpuctx, ctx);
1978
1979         pmu->cancel_txn(pmu);
1980
1981         perf_cpu_hrtimer_restart(cpuctx);
1982
1983         return -EAGAIN;
1984 }
1985
1986 /*
1987  * Work out whether we can put this event group on the CPU now.
      *
      * @event is the group leader being considered, @cpuctx the CPU context
      * it would run in, and @can_add_hw the caller's running verdict on
      * whether hardware groups may still be added. Returns non-zero when the
      * group may go on and 0 when it may not; the checks below are applied
      * in order and the first one that matches decides.
1988  */
1989 static int group_can_go_on(struct perf_event *event,
1990                            struct perf_cpu_context *cpuctx,
1991                            int can_add_hw)
1992 {
1993         /*
1994          * Groups consisting entirely of software events can always go on.
1995          */
1996         if (event->group_flags & PERF_GROUP_SOFTWARE)
1997                 return 1;
1998         /*
1999          * If an exclusive group is already on, no other hardware
2000          * events can go on.
2001          */
2002         if (cpuctx->exclusive)
2003                 return 0;
2004         /*
2005          * If this group is exclusive and there are already
2006          * events on the CPU, it can't go on.
2007          */
2008         if (event->attr.exclusive && cpuctx->active_oncpu)
2009                 return 0;
2010         /*
2011          * Otherwise, try to add it if all previous groups were able
2012          * to go on.
2013          */
2014         return can_add_hw;
2015 }
2016
2017 static void add_event_to_ctx(struct perf_event *event,
2018                                struct perf_event_context *ctx)
2019 {
             /*
              * Link @event into @ctx via list_add_event() and
              * perf_group_attach(), then start its enabled, running and
              * stopped timestamps at the same value: the event time sampled
              * on entry.
              */
2020         u64 tstamp = perf_event_time(event);
2021
2022         list_add_event(event, ctx);
2023         perf_group_attach(event);
2024         event->tstamp_enabled = tstamp;
2025         event->tstamp_running = tstamp;
2026         event->tstamp_stopped = tstamp;
2027 }
2028
2029 static void task_ctx_sched_out(struct perf_event_context *ctx);
2030 static void
2031 ctx_sched_in(struct perf_event_context *ctx,
2032              struct perf_cpu_context *cpuctx,
2033              enum event_type_t event_type,
2034              struct task_struct *task);
2035
2036 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2037                                 struct perf_event_context *ctx,
2038                                 struct task_struct *task)
2039 {
             /*
              * Schedule events in a fixed order: CPU pinned, task pinned,
              * CPU flexible, task flexible. @ctx is the task context and may
              * be NULL, in which case only the CPU context is scheduled in.
              */
2040         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2041         if (ctx)
2042                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2043         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2044         if (ctx)
2045                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2046 }
2047
2048 /*
2049  * Cross CPU call to install and enable a performance event
2050  *
2051  * Must be called with ctx->mutex held
      *
      * @info is the struct perf_event to install; its ->ctx must already be
      * set. Everything currently scheduled is taken off the PMU, the event is
      * added to the context, and everything is scheduled back in.
      * Always returns 0.
2052  */
2053 static int  __perf_install_in_context(void *info)
2054 {
2055         struct perf_event *event = info;
2056         struct perf_event_context *ctx = event->ctx;
2057         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2058         struct perf_event_context *task_ctx = cpuctx->task_ctx;
             /* Default; replaced by task_ctx->task below if a task ctx is set. */
2059         struct task_struct *task = current;
2060
2061         perf_ctx_lock(cpuctx, task_ctx);
2062         perf_pmu_disable(cpuctx->ctx.pmu);
2063
2064         /*
2065          * If there was an active task_ctx schedule it out.
2066          */
2067         if (task_ctx)
2068                 task_ctx_sched_out(task_ctx);
2069
2070         /*
2071          * If the context we're installing events in is not the
2072          * active task_ctx, flip them.
2073          */
2074         if (ctx->task && task_ctx != ctx) {
                     /* Swap which task context lock is held: old out, ctx in. */
2075                 if (task_ctx)
2076                         raw_spin_unlock(&task_ctx->lock);
2077                 raw_spin_lock(&ctx->lock);
2078                 task_ctx = ctx;
2079         }
2080
2081         if (task_ctx) {
2082                 cpuctx->task_ctx = task_ctx;
2083                 task = task_ctx->task;
2084         }
2085
2086         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2087
2088         update_context_time(ctx);
2089         /*
2090          * update cgrp time only if current cgrp
2091          * matches event->cgrp. Must be done before
2092          * calling add_event_to_ctx()
2093          */
2094         update_cgrp_time_from_event(event);
2095
2096         add_event_to_ctx(event, ctx);
2097
2098         /*
2099          * Schedule everything back in
2100          */
2101         perf_event_sched_in(cpuctx, task_ctx, task);
2102
2103         perf_pmu_enable(cpuctx->ctx.pmu);
2104         perf_ctx_unlock(cpuctx, task_ctx);
2105
2106         return 0;
2107 }
2108
2109 /*
2110  * Attach a performance event to a context
2111  *
2112  * First we add the event to the list with the hardware enable bit
2113  * in event->hw_config cleared.
2114  *
2115  * If the event is attached to a task which is on a CPU we use a smp
2116  * call to enable it in the task context. The task might have been
2117  * scheduled away, but we check this in the smp call again.
      *
      * The caller must hold ctx->mutex (asserted below). @cpu is the CPU used
      * for the cross-call when the context has no task.
2118  */
2119 static void
2120 perf_install_in_context(struct perf_event_context *ctx,
2121                         struct perf_event *event,
2122                         int cpu)
2123 {
2124         struct task_struct *task = ctx->task;
2125
2126         lockdep_assert_held(&ctx->mutex);
2127
2128         event->ctx = ctx;
             /* An event with cpu == -1 keeps it; otherwise it is bound to @cpu. */
2129         if (event->cpu != -1)
2130                 event->cpu = cpu;
2131
2132         if (!task) {
2133                 /*
2134                  * Per cpu events are installed via an smp call and
2135                  * the install is always successful.
2136                  */
2137                 cpu_function_call(cpu, __perf_install_in_context, event);
2138                 return;
2139         }
2140
2141 retry:
             /* A zero return means the cross-call did the install; done. */
2142         if (!task_function_call(task, __perf_install_in_context, event))
2143                 return;
2144
2145         raw_spin_lock_irq(&ctx->lock);
2146         /*
2147          * If we failed to find a running task, but find the context active now
2148          * that we've acquired the ctx->lock, retry.
2149          */
2150         if (ctx->is_active) {
2151                 raw_spin_unlock_irq(&ctx->lock);
2152                 /*
2153                  * Reload the task pointer, it might have been changed by
2154                  * a concurrent perf_event_context_sched_out().
2155                  */
2156                 task = ctx->task;
2157                 goto retry;
2158         }
2159
2160         /*
2161          * Since the task isn't running, it's safe to add the event, us holding
2162          * the ctx->lock ensures the task won't get scheduled in.
2163          */
2164         add_event_to_ctx(event, ctx);
2165         raw_spin_unlock_irq(&ctx->lock);
2166 }
2167
2168 /*
2169  * Put an event into inactive state and update time fields.
2170  * Enabling the leader of a group effectively enables all
2171  * the group members that aren't explicitly disabled, so we
2172  * have to update their ->tstamp_enabled also.
2173  * Note: this works for group members as well as group leaders
2174  * since the non-leader members' sibling_lists will be empty.
      *
      * tstamp_enabled is set to (current event time - total_time_enabled),
      * for @event itself and for each sibling whose state is INACTIVE or
      * higher.
2175  */
2176 static void __perf_event_mark_enabled(struct perf_event *event)
2177 {
2178         struct perf_event *sub;
2179         u64 tstamp = perf_event_time(event);
2180
2181         event->state = PERF_EVENT_STATE_INACTIVE;
2182         event->tstamp_enabled = tstamp - event->total_time_enabled;
2183         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2184                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2185                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2186         }
2187 }
2188
2189 /*
2190  * Cross CPU call to enable a performance event
      *
      * @info is the struct perf_event to enable. Returns -EINVAL when the
      * context is found inactive on entry, and 0 in every other case: a
      * failure to schedule the event in is handled here and is not reported
      * to the caller.
2191  */
2192 static int __perf_event_enable(void *info)
2193 {
2194         struct perf_event *event = info;
2195         struct perf_event_context *ctx = event->ctx;
2196         struct perf_event *leader = event->group_leader;
2197         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2198         int err;
2199
2200         /*
2201          * There's a time window between 'ctx->is_active' check
2202          * in perf_event_enable function and this place having:
2203          *   - IRQs on
2204          *   - ctx->lock unlocked
2205          *
2206          * where the task could be killed and 'ctx' deactivated
2207          * by perf_event_exit_task.
2208          */
2209         if (!ctx->is_active)
2210                 return -EINVAL;
2211
2212         raw_spin_lock(&ctx->lock);
2213         update_context_time(ctx);
2214
             /* Already INACTIVE or ACTIVE: nothing to enable. */
2215         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2216                 goto unlock;
2217
2218         /*
2219          * set current task's cgroup time reference point
2220          */
2221         perf_cgroup_set_timestamp(current, ctx);
2222
2223         __perf_event_mark_enabled(event);
2224
             /*
              * The event is now marked enabled, but it is only put on the
              * PMU here if it passes the filter.
              */
2225         if (!event_filter_match(event)) {
2226                 if (is_cgroup_event(event))
2227                         perf_cgroup_defer_enabled(event);
2228                 goto unlock;
2229         }
2230
2231         /*
2232          * If the event is in a group and isn't the group leader,
2233          * then don't put it on unless the group is on.
2234          */
2235         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2236                 goto unlock;
2237
             /* err is assigned on every path through this if/else. */
2238         if (!group_can_go_on(event, cpuctx, 1)) {
2239                 err = -EEXIST;
2240         } else {
2241                 if (event == leader)
2242                         err = group_sched_in(event, cpuctx, ctx);
2243                 else
2244                         err = event_sched_in(event, cpuctx, ctx);
2245         }
2246
2247         if (err) {
2248                 /*
2249                  * If this event can't go on and it's part of a
2250                  * group, then the whole group has to come off.
2251                  */
2252                 if (leader != event) {
2253                         group_sched_out(leader, cpuctx, ctx);
2254                         perf_cpu_hrtimer_restart(cpuctx);
2255                 }
                     /* A pinned group that cannot go on is put in ERROR state. */
2256                 if (leader->attr.pinned) {
2257                         update_group_times(leader);
2258                         leader->state = PERF_EVENT_STATE_ERROR;
2259                 }
2260         }
2261
2262 unlock:
2263         raw_spin_unlock(&ctx->lock);
2264
2265         return 0;
2266 }
2267
2268 /*
2269  * Enable an event.
2270  *
2271  * If event->ctx is a cloned context, callers must make sure that
2272  * every task struct that event->ctx->task could possibly point to
2273  * remains valid.  This condition is satisfied when called through
2274  * perf_event_for_each_child or perf_event_for_each as described
2275  * for perf_event_disable.
      *
      * Events whose context has no task are enabled by a cross-call to
      * event->cpu. For task events the event is either marked enabled
      * directly (context not active) or enabled by a cross-call to the task,
      * retried for as long as the context is active and the event still OFF.
2276  */
2277 static void _perf_event_enable(struct perf_event *event)
2278 {
2279         struct perf_event_context *ctx = event->ctx;
2280         struct task_struct *task = ctx->task;
2281
2282         if (!task) {
2283                 /*
2284                  * Enable the event on the cpu that it's on
2285                  */
2286                 cpu_function_call(event->cpu, __perf_event_enable, event);
2287                 return;
2288         }
2289
2290         raw_spin_lock_irq(&ctx->lock);
             /* Already INACTIVE or ACTIVE: nothing to do. */
2291         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2292                 goto out;
2293
2294         /*
2295          * If the event is in error state, clear that first.
2296          * That way, if we see the event in error state below, we
2297          * know that it has gone back into error state, as distinct
2298          * from the task having been scheduled away before the
2299          * cross-call arrived.
2300          */
2301         if (event->state == PERF_EVENT_STATE_ERROR)
2302                 event->state = PERF_EVENT_STATE_OFF;
2303
2304 retry:
             /* ctx->lock is held whenever control reaches this label. */
2305         if (!ctx->is_active) {
2306                 __perf_event_mark_enabled(event);
2307                 goto out;
2308         }
2309
2310         raw_spin_unlock_irq(&ctx->lock);
2311
             /* A zero return means the cross-call ran; the lock is not held. */
2312         if (!task_function_call(task, __perf_event_enable, event))
2313                 return;
2314
2315         raw_spin_lock_irq(&ctx->lock);
2316
2317         /*
2318          * If the context is active and the event is still off,
2319          * we need to retry the cross-call.
2320          */
2321         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2322                 /*
2323                  * task could have been flipped by a concurrent
2324                  * perf_event_context_sched_out()
2325                  */
2326                 task = ctx->task;
2327                 goto retry;
2328         }
2329
2330 out:
2331         raw_spin_unlock_irq(&ctx->lock);
2332 }
2333
2334 /*
2335  * See perf_event_disable();
      *
      * Enable @event: obtain its context through perf_event_ctx_lock(), run
      * _perf_event_enable() on it and release the context again with
      * perf_event_ctx_unlock(). Exported to GPL modules.
2336  */
2337 void perf_event_enable(struct perf_event *event)
2338 {
2339         struct perf_event_context *ctx;
2340
2341         ctx = perf_event_ctx_lock(event);
2342         _perf_event_enable(event);
2343         perf_event_ctx_unlock(event, ctx);
2344 }
2345 EXPORT_SYMBOL_GPL(perf_event_enable);
2346
2347 static int _perf_event_refresh(struct perf_event *event, int refresh)
2348 {
             /*
              * Add @refresh to event->event_limit and enable the event.
              * Returns 0 on success, -EINVAL if the event is inherited or is
              * not a sampling event.
              */
2349         /*
2350          * not supported on inherited events
2351          */
2352         if (event->attr.inherit || !is_sampling_event(event))
2353                 return -EINVAL;
2354
2355         atomic_add(refresh, &event->event_limit);
2356         _perf_event_enable(event);
2357
2358         return 0;
2359 }
2360
2361 /*
2362  * See perf_event_disable()
      *
      * Runs _perf_event_refresh(@event, @refresh) between
      * perf_event_ctx_lock() and perf_event_ctx_unlock() and returns its
      * result unchanged. Exported to GPL modules.
2363  */
2364 int perf_event_refresh(struct perf_event *event, int refresh)
2365 {
2366         struct perf_event_context *ctx;
2367         int ret;
2368
2369         ctx = perf_event_ctx_lock(event);
2370         ret = _perf_event_refresh(event, refresh);
2371         perf_event_ctx_unlock(event, ctx);
2372
2373         return ret;
2374 }
2375 EXPORT_SYMBOL_GPL(perf_event_refresh);
2376
2377 static void ctx_sched_out(struct perf_event_context *ctx,
2378                           struct perf_cpu_context *cpuctx,
2379                           enum event_type_t event_type)
2380 {
             /*
              * Schedule out the groups of @ctx selected by @event_type
              * (EVENT_PINNED and/or EVENT_FLEXIBLE bits).
              *
              * The @event_type bits are always cleared from ctx->is_active,
              * even when the function returns early because the context has
              * no events or no active events.
              */
2381         struct perf_event *event;
             /* Snapshot taken before the bits are cleared just below. */
2382         int is_active = ctx->is_active;
2383
2384         ctx->is_active &= ~event_type;
2385         if (likely(!ctx->nr_events))
2386                 return;
2387
2388         update_context_time(ctx);
2389         update_cgrp_time_from_cpuctx(cpuctx);
2390         if (!ctx->nr_active)
2391                 return;
2392
2393         perf_pmu_disable(ctx->pmu);
             /* Only touch a list that was active before and is being requested. */
2394         if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2395                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2396                         group_sched_out(event, cpuctx, ctx);
2397         }
2398
2399         if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2400                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2401                         group_sched_out(event, cpuctx, ctx);
2402         }
2403         perf_pmu_enable(ctx->pmu);
2404 }
2405
2406 /*
2407  * Test whether two contexts are equivalent, i.e. whether they have both been
2408  * cloned from the same version of the same context.
2409  *
2410  * Equivalence is measured using a generation number in the context that is
2411  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2412  * and list_del_event().
      *
      * Both ctx1->lock and ctx2->lock must be held (asserted via lockdep).
      * Returns 1 when the contexts are equivalent and 0 otherwise; a non-zero
      * pin_count on either context always yields 0.
2413  */
2414 static int context_equiv(struct perf_event_context *ctx1,
2415                          struct perf_event_context *ctx2)
2416 {
2417         lockdep_assert_held(&ctx1->lock);
2418         lockdep_assert_held(&ctx2->lock);
2419
2420         /* Pinning disables the swap optimization */
2421         if (ctx1->pin_count || ctx2->pin_count)
2422                 return 0;
2423
2424         /* If ctx1 is the parent of ctx2 */
2425         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2426                 return 1;
2427
2428         /* If ctx2 is the parent of ctx1 */
2429         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2430                 return 1;
2431
2432         /*
2433          * If ctx1 and ctx2 have the same parent; we flatten the parent
2434          * hierarchy, see perf_event_init_context().
2435          */
2436         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2437                         ctx1->parent_gen == ctx2->parent_gen)
2438                 return 1;
2439
2440         /* Unmatched */
2441         return 0;
2442 }
2443
2444 static void __perf_event_sync_stat(struct perf_event *event,
2445                                      struct perf_event *next_event)
2446 {
             /*
              * Exchange the count and the total enabled/running times of
              * @event and @next_event. Does nothing unless
              * event->attr.inherit_stat is set.
              */
2447         u64 value;
2448
2449         if (!event->attr.inherit_stat)
2450                 return;
2451
2452         /*
2453          * Update the event value, we cannot use perf_event_read()
2454          * because we're in the middle of a context switch and have IRQs
2455          * disabled, which upsets smp_call_function_single(), however
2456          * we know the event must be on the current CPU, therefore we
2457          * don't need to use it.
2458          */
2459         switch (event->state) {
2460         case PERF_EVENT_STATE_ACTIVE:
2461                 event->pmu->read(event);
2462                 /* fall-through */
2463
2464         case PERF_EVENT_STATE_INACTIVE:
2465                 update_event_times(event);
2466                 break;
2467
2468         default:
2469                 break;
2470         }
2471
2472         /*
2473          * In order to keep per-task stats reliable we need to flip the event
2474          * values when we flip the contexts.
2475          */
2476         value = local64_read(&next_event->count);
2477         value = local64_xchg(&event->count, value);
2478         local64_set(&next_event->count, value);
2479
2480         swap(event->total_time_enabled, next_event->total_time_enabled);
2481         swap(event->total_time_running, next_event->total_time_running);
2482
2483         /*
2484          * Since we swizzled the values, update the user visible data too.
2485          */
2486         perf_event_update_userpage(event);
2487         perf_event_update_userpage(next_event);
2488 }
2489
2490 static void perf_event_sync_stat(struct perf_event_context *ctx,
2491                                    struct perf_event_context *next_ctx)
2492 {
             /*
              * Pair up the events of @ctx and @next_ctx by their position in
              * each context's event_list and run __perf_event_sync_stat() on
              * every pair. Does nothing when ctx->nr_stat is zero.
              */
2493         struct perf_event *event, *next_event;
2494
2495         if (!ctx->nr_stat)
2496                 return;
2497
2498         update_context_time(ctx);
2499
2500         event = list_first_entry(&ctx->event_list,
2501                                    struct perf_event, event_entry);
2502
2503         next_event = list_first_entry(&next_ctx->event_list,
2504                                         struct perf_event, event_entry);
2505
             /* Walk both lists in lockstep; stop when either one is exhausted. */
2506         while (&event->event_entry != &ctx->event_list &&
2507                &next_event->event_entry != &next_ctx->event_list) {
2508
2509                 __perf_event_sync_stat(event, next_event);
2510
2511                 event = list_next_entry(event, event_entry);
2512                 next_event = list_next_entry(next_event, event_entry);
2513         }
2514 }
2515
2516 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2517                                          struct task_struct *next)
2518 {
             /*
              * Handle context number @ctxn when switching from @task to @next.
              *
              * If the two tasks' contexts turn out to be equivalent clones,
              * the context pointers are exchanged between the tasks and
              * nothing is scheduled out. Otherwise all events of @task's
              * context are scheduled out and cpuctx->task_ctx is cleared.
              */
2519         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2520         struct perf_event_context *next_ctx;
2521         struct perf_event_context *parent, *next_parent;
2522         struct perf_cpu_context *cpuctx;
2523         int do_switch = 1;
2524
2525         if (likely(!ctx))
2526                 return;
2527
2528         cpuctx = __get_cpu_context(ctx);
2529         if (!cpuctx->task_ctx)
2530                 return;
2531
2532         rcu_read_lock();
2533         next_ctx = next->perf_event_ctxp[ctxn];
2534         if (!next_ctx)
2535                 goto unlock;
2536
2537         parent = rcu_dereference(ctx->parent_ctx);
2538         next_parent = rcu_dereference(next_ctx->parent_ctx);
2539
2540         /* If neither context has a parent context; they cannot be clones. */
2541         if (!parent && !next_parent)
2542                 goto unlock;
2543
2544         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2545                 /*
2546                  * Looks like the two contexts are clones, so we might be
2547                  * able to optimize the context switch.  We lock both
2548                  * contexts and check that they are clones under the
2549                  * lock (including re-checking that neither has been
2550                  * uncloned in the meantime).  It doesn't matter which
2551                  * order we take the locks because no other cpu could
2552                  * be trying to lock both of these tasks.
2553                  */
2554                 raw_spin_lock(&ctx->lock);
2555                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2556                 if (context_equiv(ctx, next_ctx)) {
2557                         /*
2558                          * XXX do we need a memory barrier of sorts
2559                          * wrt to rcu_dereference() of perf_event_ctxp
2560                          */
2561                         task->perf_event_ctxp[ctxn] = next_ctx;
2562                         next->perf_event_ctxp[ctxn] = ctx;
2563                         ctx->task = next;
2564                         next_ctx->task = task;
2565
2566                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2567
                             /* The swap replaces the schedule-out below. */
2568                         do_switch = 0;
2569
2570                         perf_event_sync_stat(ctx, next_ctx);
2571                 }
2572                 raw_spin_unlock(&next_ctx->lock);
2573                 raw_spin_unlock(&ctx->lock);
2574         }
2575 unlock:
2576         rcu_read_unlock();
2577
2578         if (do_switch) {
2579                 raw_spin_lock(&ctx->lock);
2580                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2581                 cpuctx->task_ctx = NULL;
2582                 raw_spin_unlock(&ctx->lock);
2583         }
2584 }
2585
2586 void perf_sched_cb_dec(struct pmu *pmu)
2587 {
             /* Decrement this CPU's perf_sched_cb_usages; @pmu is not used. */
2588         this_cpu_dec(perf_sched_cb_usages);
2589 }
2590
2591 void perf_sched_cb_inc(struct pmu *pmu)
2592 {
             /* Increment this CPU's perf_sched_cb_usages; @pmu is not used. */
2593         this_cpu_inc(perf_sched_cb_usages);
2594 }
2595
2596 /*
2597  * This function provides the context switch callback to the lower code
2598  * layer. It is invoked ONLY when the context switch callback is enabled.
      *
      * For every registered PMU that has a ->sched_task callback, the
      * callback is invoked with this CPU's task context and @sched_in, with
      * the PMU disabled and the context locked around the call. Nothing is
      * done when @prev == @next. Local interrupts are disabled for the
      * duration of the walk.
2599  */
2600 static void perf_pmu_sched_task(struct task_struct *prev,
2601                                 struct task_struct *next,
2602                                 bool sched_in)
2603 {
2604         struct perf_cpu_context *cpuctx;
2605         struct pmu *pmu;
2606         unsigned long flags;
2607
2608         if (prev == next)
2609                 return;
2610
2611         local_irq_save(flags);
2612
2613         rcu_read_lock();
2614
2615         list_for_each_entry_rcu(pmu, &pmus, entry) {
2616                 if (pmu->sched_task) {
2617                         cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2618
2619                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2620
2621                         perf_pmu_disable(pmu);
2622
2623                         pmu->sched_task(cpuctx->task_ctx, sched_in);
2624
2625                         perf_pmu_enable(pmu);
2626
2627                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2628                 }
2629         }
2630
2631         rcu_read_unlock();
2632
2633         local_irq_restore(flags);
2634 }
2635
2636 #define for_each_task_context_nr(ctxn)                                  \
2637         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2638
2639 /*
2640  * Called from scheduler to remove the events of the current task,
2641  * with interrupts disabled.
2642  *
2643  * We stop each event and update the event value in event->count.
2644  *
2645  * This does not protect us against NMI, but disable()
2646  * sets the disabled bit in the control field of event _before_
2647  * accessing the event control register. If a NMI hits, then it will
2648  * not restart the event.
      *
      * @task is the task being switched out and @next the task being
      * switched in. Three steps run in order: the PMU sched_task callbacks
      * (only if this CPU's perf_sched_cb_usages is non-zero), the per-context
      * schedule-out for every task context number, and the cgroup
      * schedule-out (only if this CPU has cgroup events).
2649  */
2650 void __perf_event_task_sched_out(struct task_struct *task,
2651                                  struct task_struct *next)
2652 {
2653         int ctxn;
2654
2655         if (__this_cpu_read(perf_sched_cb_usages))
2656                 perf_pmu_sched_task(task, next, false);
2657
2658         for_each_task_context_nr(ctxn)
2659                 perf_event_context_sched_out(task, ctxn, next);
2660
2661         /*
2662          * if cgroup events exist on this CPU, then we need
2663          * to check if we have to switch out PMU state.
2664          * cgroup event are system-wide mode only
2665          */
2666         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2667                 perf_cgroup_sched_out(task, next);
2668 }
2669
2670 static void task_ctx_sched_out(struct perf_event_context *ctx)
2671 {
             /*
              * Schedule out all events of task context @ctx and clear
              * cpuctx->task_ctx. Does nothing when the CPU context has no
              * task context installed; warns once and does nothing when
              * @ctx is not the installed one.
              */
2672         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2673
2674         if (!cpuctx->task_ctx)
2675                 return;
2676
2677         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2678                 return;
2679
2680         ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2681         cpuctx->task_ctx = NULL;
2682 }
2683
2684 /*
2685  * Called with IRQs disabled
      *
      * Schedule out the events selected by @event_type from the CPU's own
      * context (cpuctx->ctx); thin wrapper around ctx_sched_out().
2686  */
2687 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2688                               enum event_type_t event_type)
2689 {
2690         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2691 }
2692
2693 static void
2694 ctx_pinned_sched_in(struct perf_event_context *ctx,
2695                     struct perf_cpu_context *cpuctx)
2696 {
             /*
              * Try to schedule in every group on ctx->pinned_groups. Groups
              * whose leader is OFF (or lower) or fails the filter are
              * skipped. A group whose leader is still INACTIVE after the
              * attempt is moved to the ERROR state.
              */
2697         struct perf_event *event;
2698
2699         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2700                 if (event->state <= PERF_EVENT_STATE_OFF)
2701                         continue;
2702                 if (!event_filter_match(event))
2703                         continue;
2704
2705                 /* may need to reset tstamp_enabled */
2706                 if (is_cgroup_event(event))
2707                         perf_cgroup_mark_enabled(event, ctx);
2708
                     /* The return value is not checked; the state test below is. */
2709                 if (group_can_go_on(event, cpuctx, 1))
2710                         group_sched_in(event, cpuctx, ctx);
2711
2712                 /*
2713                  * If this pinned group hasn't been scheduled,
2714                  * put it in error state.
2715                  */
2716                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2717                         update_group_times(event);
2718                         event->state = PERF_EVENT_STATE_ERROR;
2719                 }
2720         }
2721 }
2722
2723 static void
2724 ctx_flexible_sched_in(struct perf_event_context *ctx,
2725                       struct perf_cpu_context *cpuctx)
2726 {
2727         struct perf_event *event;
2728         int can_add_hw = 1;
2729
2730         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2731                 /* Ignore events in OFF or ERROR state */
2732                 if (event->state <= PERF_EVENT_STATE_OFF)
2733                         continue;
2734                 /*
2735                  * Listen to the 'cpu' scheduling filter constraint
2736                  * of events:
2737                  */
2738                 if (!event_filter_match(event))
2739                         continue;
2740
2741                 /* may need to reset tstamp_enabled */
2742                 if (is_cgroup_event(event))
2743                         perf_cgroup_mark_enabled(event, ctx);
2744
2745                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2746                         if (group_sched_in(event, cpuctx, ctx))
2747                                 can_add_hw = 0;
2748                 }
2749         }
2750 }
2751
2752 static void
2753 ctx_sched_in(struct perf_event_context *ctx,
2754              struct perf_cpu_context *cpuctx,
2755              enum event_type_t event_type,
2756              struct task_struct *task)
2757 {
2758         u64 now;
2759         int is_active = ctx->is_active;
2760
2761         ctx->is_active |= event_type;
2762         if (likely(!ctx->nr_events))
2763                 return;
2764
2765         now = perf_clock();
2766         ctx->timestamp = now;
2767         perf_cgroup_set_timestamp(task, ctx);
2768         /*
2769          * First go through the list and put on any pinned groups
2770          * in order to give them the best chance of going on.
2771          */
2772         if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2773                 ctx_pinned_sched_in(ctx, cpuctx);
2774
2775         /* Then walk through the lower prio flexible groups */
2776         if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2777                 ctx_flexible_sched_in(ctx, cpuctx);
2778 }
2779
2780 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2781                              enum event_type_t event_type,
2782                              struct task_struct *task)
2783 {
2784         struct perf_event_context *ctx = &cpuctx->ctx;
2785
2786         ctx_sched_in(ctx, cpuctx, event_type, task);
2787 }
2788
2789 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2790                                         struct task_struct *task)
2791 {
2792         struct perf_cpu_context *cpuctx;
2793
2794         cpuctx = __get_cpu_context(ctx);
2795         if (cpuctx->task_ctx == ctx)
2796                 return;
2797
2798         perf_ctx_lock(cpuctx, ctx);
2799         perf_pmu_disable(ctx->pmu);
2800         /*
2801          * We want to keep the following priority order:
2802          * cpu pinned (that don't need to move), task pinned,
2803          * cpu flexible, task flexible.
2804          */
2805         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2806
2807         if (ctx->nr_events)
2808                 cpuctx->task_ctx = ctx;
2809
2810         perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2811
2812         perf_pmu_enable(ctx->pmu);
2813         perf_ctx_unlock(cpuctx, ctx);
2814 }
2815
2816 /*
2817  * Called from scheduler to add the events of the current task
2818  * with interrupts disabled.
2819  *
2820  * We restore the event value and then enable it.
2821  *
2822  * This does not protect us against NMI, but enable()
2823  * sets the enabled bit in the control field of event _before_
2824  * accessing the event control register. If a NMI hits, then it will
2825  * keep the event running.
2826  */
2827 void __perf_event_task_sched_in(struct task_struct *prev,
2828                                 struct task_struct *task)
2829 {
2830         struct perf_event_context *ctx;
2831         int ctxn;
2832
2833         for_each_task_context_nr(ctxn) {
2834                 ctx = task->perf_event_ctxp[ctxn];
2835                 if (likely(!ctx))
2836                         continue;
2837
2838                 perf_event_context_sched_in(ctx, task);
2839         }
2840         /*
2841          * if cgroup events exist on this CPU, then we need
2842          * to check if we have to switch in PMU state.
2843          * cgroup event are system-wide mode only
2844          */
2845         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2846                 perf_cgroup_sched_in(prev, task);
2847
2848         if (__this_cpu_read(perf_sched_cb_usages))
2849                 perf_pmu_sched_task(prev, task, true);
2850 }
2851
2852 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2853 {
2854         u64 frequency = event->attr.sample_freq;
2855         u64 sec = NSEC_PER_SEC;
2856         u64 divisor, dividend;
2857
2858         int count_fls, nsec_fls, frequency_fls, sec_fls;
2859
2860         count_fls = fls64(count);
2861         nsec_fls = fls64(nsec);
2862         frequency_fls = fls64(frequency);
2863         sec_fls = 30;
2864
2865         /*
2866          * We got @count in @nsec, with a target of sample_freq HZ
2867          * the target period becomes:
2868          *
2869          *             @count * 10^9
2870          * period = -------------------
2871          *          @nsec * sample_freq
2872          *
2873          */
2874
2875         /*
2876          * Reduce accuracy by one bit such that @a and @b converge
2877          * to a similar magnitude.
2878          */
2879 #define REDUCE_FLS(a, b)                \
2880 do {                                    \
2881         if (a##_fls > b##_fls) {        \
2882                 a >>= 1;                \
2883                 a##_fls--;              \
2884         } else {                        \
2885                 b >>= 1;                \
2886                 b##_fls--;              \
2887         }                               \
2888 } while (0)
2889
2890         /*
2891          * Reduce accuracy until either term fits in a u64, then proceed with
2892          * the other, so that finally we can do a u64/u64 division.
2893          */
2894         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2895                 REDUCE_FLS(nsec, frequency);
2896                 REDUCE_FLS(sec, count);
2897         }
2898
2899         if (count_fls + sec_fls > 64) {
2900                 divisor = nsec * frequency;
2901
2902                 while (count_fls + sec_fls > 64) {
2903                         REDUCE_FLS(count, sec);
2904                         divisor >>= 1;
2905                 }
2906
2907                 dividend = count * sec;
2908         } else {
2909                 dividend = count * sec;
2910
2911                 while (nsec_fls + frequency_fls > 64) {
2912                         REDUCE_FLS(nsec, frequency);
2913                         dividend >>= 1;
2914                 }
2915
2916                 divisor = nsec * frequency;
2917         }
2918
2919         if (!divisor)
2920                 return dividend;
2921
2922         return div64_u64(dividend, divisor);
2923 }
2924
2925 static DEFINE_PER_CPU(int, perf_throttled_count);
2926 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2927
2928 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2929 {
2930         struct hw_perf_event *hwc = &event->hw;
2931         s64 period, sample_period;
2932         s64 delta;
2933
2934         period = perf_calculate_period(event, nsec, count);
2935
2936         delta = (s64)(period - hwc->sample_period);
2937         delta = (delta + 7) / 8; /* low pass filter */
2938
2939         sample_period = hwc->sample_period + delta;
2940
2941         if (!sample_period)
2942                 sample_period = 1;
2943
2944         hwc->sample_period = sample_period;
2945
2946         if (local64_read(&hwc->period_left) > 8*sample_period) {
2947                 if (disable)
2948                         event->pmu->stop(event, PERF_EF_UPDATE);
2949
2950                 local64_set(&hwc->period_left, 0);
2951
2952                 if (disable)
2953                         event->pmu->start(event, PERF_EF_RELOAD);
2954         }
2955 }
2956
2957 /*
2958  * combine freq adjustment with unthrottling to avoid two passes over the
2959  * events. At the same time, make sure, having freq events does not change
2960  * the rate of unthrottling as that would introduce bias.
2961  */
2962 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2963                                            int needs_unthr)
2964 {
2965         struct perf_event *event;
2966         struct hw_perf_event *hwc;
2967         u64 now, period = TICK_NSEC;
2968         s64 delta;
2969
2970         /*
2971          * only need to iterate over all events iff:
2972          * - context have events in frequency mode (needs freq adjust)
2973          * - there are events to unthrottle on this cpu
2974          */
2975         if (!(ctx->nr_freq || needs_unthr))
2976                 return;
2977
2978         raw_spin_lock(&ctx->lock);
2979         perf_pmu_disable(ctx->pmu);
2980
2981         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2982                 if (event->state != PERF_EVENT_STATE_ACTIVE)
2983                         continue;
2984
2985                 if (!event_filter_match(event))
2986                         continue;
2987
2988                 perf_pmu_disable(event->pmu);
2989
2990                 hwc = &event->hw;
2991
2992                 if (hwc->interrupts == MAX_INTERRUPTS) {
2993                         hwc->interrupts = 0;
2994                         perf_log_throttle(event, 1);
2995                         event->pmu->start(event, 0);
2996                 }
2997
2998                 if (!event->attr.freq || !event->attr.sample_freq)
2999                         goto next;
3000
3001                 /*
3002                  * stop the event and update event->count
3003                  */
3004                 event->pmu->stop(event, PERF_EF_UPDATE);
3005
3006                 now = local64_read(&event->count);
3007                 delta = now - hwc->freq_count_stamp;
3008                 hwc->freq_count_stamp = now;
3009
3010                 /*
3011                  * restart the event
3012                  * reload only if value has changed
3013                  * we have stopped the event so tell that
3014                  * to perf_adjust_period() to avoid stopping it
3015                  * twice.
3016                  */
3017                 if (delta > 0)
3018                         perf_adjust_period(event, period, delta, false);
3019
3020                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3021         next:
3022                 perf_pmu_enable(event->pmu);
3023         }
3024
3025         perf_pmu_enable(ctx->pmu);
3026         raw_spin_unlock(&ctx->lock);
3027 }
3028
3029 /*
3030  * Round-robin a context's events:
3031  */
3032 static void rotate_ctx(struct perf_event_context *ctx)
3033 {
3034         /*
3035          * Rotate the first entry last of non-pinned groups. Rotation might be
3036          * disabled by the inheritance code.
3037          */
3038         if (!ctx->rotate_disable)
3039                 list_rotate_left(&ctx->flexible_groups);
3040 }
3041
3042 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3043 {
3044         struct perf_event_context *ctx = NULL;
3045         int rotate = 0;
3046
3047         if (cpuctx->ctx.nr_events) {
3048                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3049                         rotate = 1;
3050         }
3051
3052         ctx = cpuctx->task_ctx;
3053         if (ctx && ctx->nr_events) {
3054                 if (ctx->nr_events != ctx->nr_active)
3055                         rotate = 1;
3056         }
3057
3058         if (!rotate)
3059                 goto done;
3060
3061         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3062         perf_pmu_disable(cpuctx->ctx.pmu);
3063
3064         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3065         if (ctx)
3066                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3067
3068         rotate_ctx(&cpuctx->ctx);
3069         if (ctx)
3070                 rotate_ctx(ctx);
3071
3072         perf_event_sched_in(cpuctx, ctx, current);
3073
3074         perf_pmu_enable(cpuctx->ctx.pmu);
3075         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3076 done:
3077
3078         return rotate;
3079 }
3080
#ifdef CONFIG_NO_HZ_FULL
/*
 * Returns false while any frequency-mode event exists (nr_freq_events) or
 * this CPU has a non-zero perf_throttled_count; true otherwise.
 */
bool perf_event_can_stop_tick(void)
{
	return !(atomic_read(&nr_freq_events) ||
		 __this_cpu_read(perf_throttled_count));
}
#endif
3091
3092 void perf_event_task_tick(void)
3093 {
3094         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3095         struct perf_event_context *ctx, *tmp;
3096         int throttled;
3097
3098         WARN_ON(!irqs_disabled());
3099
3100         __this_cpu_inc(perf_throttled_seq);
3101         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3102
3103         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3104                 perf_adjust_freq_unthr_context(ctx, throttled);
3105 }
3106
3107 static int event_enable_on_exec(struct perf_event *event,
3108                                 struct perf_event_context *ctx)
3109 {
3110         if (!event->attr.enable_on_exec)
3111                 return 0;
3112
3113         event->attr.enable_on_exec = 0;
3114         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3115                 return 0;
3116
3117         __perf_event_mark_enabled(event);
3118
3119         return 1;
3120 }
3121
3122 /*
3123  * Enable all of a task's events that have been marked enable-on-exec.
3124  * This expects task == current.
3125  */
3126 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3127 {
3128         struct perf_event_context *clone_ctx = NULL;
3129         struct perf_event *event;
3130         unsigned long flags;
3131         int enabled = 0;
3132         int ret;
3133
3134         local_irq_save(flags);
3135         if (!ctx || !ctx->nr_events)
3136                 goto out;
3137
3138         /*
3139          * We must ctxsw out cgroup events to avoid conflict
3140          * when invoking perf_task_event_sched_in() later on
3141          * in this function. Otherwise we end up trying to
3142          * ctxswin cgroup events which are already scheduled
3143          * in.
3144          */
3145         perf_cgroup_sched_out(current, NULL);
3146
3147         raw_spin_lock(&ctx->lock);
3148         task_ctx_sched_out(ctx);
3149
3150         list_for_each_entry(event, &ctx->event_list, event_entry) {
3151                 ret = event_enable_on_exec(event, ctx);
3152                 if (ret)
3153                         enabled = 1;
3154         }
3155
3156         /*
3157          * Unclone this context if we enabled any event.
3158          */
3159         if (enabled)
3160                 clone_ctx = unclone_ctx(ctx);
3161
3162         raw_spin_unlock(&ctx->lock);
3163
3164         /*
3165          * Also calls ctxswin for cgroup events, if any:
3166          */
3167         perf_event_context_sched_in(ctx, ctx->task);
3168 out:
3169         local_irq_restore(flags);
3170
3171         if (clone_ctx)
3172                 put_ctx(clone_ctx);
3173 }
3174
3175 void perf_event_exec(void)
3176 {
3177         struct perf_event_context *ctx;
3178         int ctxn;
3179
3180         rcu_read_lock();
3181         for_each_task_context_nr(ctxn) {
3182                 ctx = current->perf_event_ctxp[ctxn];
3183                 if (!ctx)
3184                         continue;
3185
3186                 perf_event_enable_on_exec(ctx);
3187         }
3188         rcu_read_unlock();
3189 }
3190
3191 /*
3192  * Cross CPU call to read the hardware event
3193  */
3194 static void __perf_event_read(void *info)
3195 {
3196         struct perf_event *event = info;
3197         struct perf_event_context *ctx = event->ctx;
3198         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3199
3200         /*
3201          * If this is a task context, we need to check whether it is
3202          * the current task context of this cpu.  If not it has been
3203          * scheduled out before the smp call arrived.  In that case
3204          * event->count would have been updated to a recent sample
3205          * when the event was scheduled out.
3206          */
3207         if (ctx->task && cpuctx->task_ctx != ctx)
3208                 return;
3209
3210         raw_spin_lock(&ctx->lock);
3211         if (ctx->is_active) {
3212                 update_context_time(ctx);
3213                 update_cgrp_time_from_event(event);
3214         }
3215         update_event_times(event);
3216         if (event->state == PERF_EVENT_STATE_ACTIVE)
3217                 event->pmu->read(event);
3218         raw_spin_unlock(&ctx->lock);
3219 }
3220
3221 static inline u64 perf_event_count(struct perf_event *event)
3222 {
3223         return local64_read(&event->count) + atomic64_read(&event->child_count);
3224 }
3225
3226 static u64 perf_event_read(struct perf_event *event)
3227 {
3228         /*
3229          * If event is enabled and currently active on a CPU, update the
3230          * value in the event structure:
3231          */
3232         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3233                 smp_call_function_single(event->oncpu,
3234                                          __perf_event_read, event, 1);
3235         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3236                 struct perf_event_context *ctx = event->ctx;
3237                 unsigned long flags;
3238
3239                 raw_spin_lock_irqsave(&ctx->lock, flags);
3240                 /*
3241                  * may read while context is not active
3242                  * (e.g., thread is blocked), in that case
3243                  * we cannot update context time
3244                  */
3245                 if (ctx->is_active) {
3246                         update_context_time(ctx);
3247                         update_cgrp_time_from_event(event);
3248                 }
3249                 update_event_times(event);
3250                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3251         }
3252
3253         return perf_event_count(event);
3254 }
3255
3256 /*
3257  * Initialize the perf_event context in a task_struct:
3258  */
3259 static void __perf_event_init_context(struct perf_event_context *ctx)
3260 {
3261         raw_spin_lock_init(&ctx->lock);
3262         mutex_init(&ctx->mutex);
3263         INIT_LIST_HEAD(&ctx->active_ctx_list);
3264         INIT_LIST_HEAD(&ctx->pinned_groups);
3265         INIT_LIST_HEAD(&ctx->flexible_groups);
3266         INIT_LIST_HEAD(&ctx->event_list);
3267         atomic_set(&ctx->refcount, 1);
3268         INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3269 }
3270
3271 static struct perf_event_context *
3272 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3273 {
3274         struct perf_event_context *ctx;
3275
3276         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3277         if (!ctx)
3278                 return NULL;
3279
3280         __perf_event_init_context(ctx);
3281         if (task) {
3282                 ctx->task = task;
3283                 get_task_struct(task);
3284         }
3285         ctx->pmu = pmu;
3286
3287         return ctx;
3288 }
3289
3290 static struct task_struct *
3291 find_lively_task_by_vpid(pid_t vpid)
3292 {
3293         struct task_struct *task;
3294         int err;
3295
3296         rcu_read_lock();
3297         if (!vpid)
3298                 task = current;
3299         else
3300                 task = find_task_by_vpid(vpid);
3301         if (task)
3302                 get_task_struct(task);
3303         rcu_read_unlock();
3304
3305         if (!task)
3306                 return ERR_PTR(-ESRCH);
3307
3308         /* Reuse ptrace permission checks for now. */
3309         err = -EACCES;
3310         if (!ptrace_may_access(task, PTRACE_MODE_READ))
3311                 goto errout;
3312
3313         return task;
3314 errout:
3315         put_task_struct(task);
3316         return ERR_PTR(err);
3317
3318 }
3319
3320 /*
3321  * Returns a matching context with refcount and pincount.
3322  */
3323 static struct perf_event_context *
3324 find_get_context(struct pmu *pmu, struct task_struct *task,
3325                 struct perf_event *event)
3326 {
3327         struct perf_event_context *ctx, *clone_ctx = NULL;
3328         struct perf_cpu_context *cpuctx;
3329         void *task_ctx_data = NULL;
3330         unsigned long flags;
3331         int ctxn, err;
3332         int cpu = event->cpu;
3333
3334         if (!task) {
3335                 /* Must be root to operate on a CPU event: */
3336                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3337                         return ERR_PTR(-EACCES);
3338
3339                 /*
3340                  * We could be clever and allow to attach a event to an
3341                  * offline CPU and activate it when the CPU comes up, but
3342                  * that's for later.
3343                  */
3344                 if (!cpu_online(cpu))
3345                         return ERR_PTR(-ENODEV);
3346
3347                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3348                 ctx = &cpuctx->ctx;
3349                 get_ctx(ctx);
3350                 ++ctx->pin_count;
3351
3352                 return ctx;
3353         }
3354
3355         err = -EINVAL;
3356         ctxn = pmu->task_ctx_nr;
3357         if (ctxn < 0)
3358                 goto errout;
3359
3360         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3361                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3362                 if (!task_ctx_data) {
3363                         err = -ENOMEM;
3364                         goto errout;
3365                 }
3366         }
3367
3368 retry:
3369         ctx = perf_lock_task_context(task, ctxn, &flags);
3370         if (ctx) {
3371                 clone_ctx = unclone_ctx(ctx);
3372                 ++ctx->pin_count;
3373
3374                 if (task_ctx_data && !ctx->task_ctx_data) {
3375                         ctx->task_ctx_data = task_ctx_data;
3376                         task_ctx_data = NULL;
3377                 }
3378                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3379
3380                 if (clone_ctx)
3381                         put_ctx(clone_ctx);
3382         } else {
3383                 ctx = alloc_perf_context(pmu, task);
3384                 err = -ENOMEM;
3385                 if (!ctx)
3386                         goto errout;
3387
3388                 if (task_ctx_data) {
3389                         ctx->task_ctx_data = task_ctx_data;
3390                         task_ctx_data = NULL;
3391                 }
3392
3393                 err = 0;
3394                 mutex_lock(&task->perf_event_mutex);
3395                 /*
3396                  * If it has already passed perf_event_exit_task().
3397                  * we must see PF_EXITING, it takes this mutex too.
3398                  */
3399                 if (task->flags & PF_EXITING)
3400                         err = -ESRCH;
3401                 else if (task->perf_event_ctxp[ctxn])
3402                         err = -EAGAIN;
3403                 else {
3404                         get_ctx(ctx);
3405                         ++ctx->pin_count;
3406                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3407                 }
3408                 mutex_unlock(&task->perf_event_mutex);
3409
3410                 if (unlikely(err)) {
3411                         put_ctx(ctx);
3412
3413                         if (err == -EAGAIN)
3414                                 goto retry;
3415                         goto errout;
3416                 }
3417         }
3418
3419         kfree(task_ctx_data);
3420         return ctx;
3421
3422 errout:
3423         kfree(task_ctx_data);
3424         return ERR_PTR(err);
3425 }
3426
3427 static void perf_event_free_filter(struct perf_event *event);
3428
3429 static void free_event_rcu(struct rcu_head *head)
3430 {
3431         struct perf_event *event;
3432
3433         event = container_of(head, struct perf_event, rcu_head);
3434         if (event->ns)
3435                 put_pid_ns(event->ns);
3436         perf_event_free_filter(event);
3437         kfree(event);
3438 }
3439
3440 static void ring_buffer_put(struct ring_buffer *rb);
3441 static void ring_buffer_attach(struct perf_event *event,
3442                                struct ring_buffer *rb);
3443
3444 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3445 {
3446         if (event->parent)
3447                 return;
3448
3449         if (is_cgroup_event(event))
3450                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3451 }
3452
3453 static void unaccount_event(struct perf_event *event)
3454 {
3455         if (event->parent)
3456                 return;
3457
3458         if (event->attach_state & PERF_ATTACH_TASK)
3459                 static_key_slow_dec_deferred(&perf_sched_events);
3460         if (event->attr.mmap || event->attr.mmap_data)
3461                 atomic_dec(&nr_mmap_events);
3462         if (event->attr.comm)
3463                 atomic_dec(&nr_comm_events);
3464         if (event->attr.task)
3465                 atomic_dec(&nr_task_events);
3466         if (event->attr.freq)
3467                 atomic_dec(&nr_freq_events);
3468         if (is_cgroup_event(event))
3469                 static_key_slow_dec_deferred(&perf_sched_events);
3470         if (has_branch_stack(event))
3471                 static_key_slow_dec_deferred(&perf_sched_events);
3472
3473         unaccount_event_cpu(event, event->cpu);
3474 }
3475
3476 static void __free_event(struct perf_event *event)
3477 {
3478         if (!event->parent) {
3479                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3480                         put_callchain_buffers();
3481         }
3482
3483         if (event->destroy)
3484                 event->destroy(event);
3485
3486         if (event->ctx)
3487                 put_ctx(event->ctx);
3488
3489         if (event->pmu)
3490                 module_put(event->pmu->module);
3491
3492         call_rcu(&event->rcu_head, free_event_rcu);
3493 }
3494
3495 static void _free_event(struct perf_event *event)
3496 {
3497         irq_work_sync(&event->pending);
3498
3499         unaccount_event(event);
3500
3501         if (event->rb) {
3502                 /*
3503                  * Can happen when we close an event with re-directed output.
3504                  *
3505                  * Since we have a 0 refcount, perf_mmap_close() will skip
3506                  * over us; possibly making our ring_buffer_put() the last.
3507                  */
3508                 mutex_lock(&event->mmap_mutex);
3509                 ring_buffer_attach(event, NULL);
3510                 mutex_unlock(&event->mmap_mutex);
3511         }
3512
3513         if (is_cgroup_event(event))
3514                 perf_detach_cgroup(event);
3515
3516         __free_event(event);
3517 }
3518
3519 /*
3520  * Used to free events which have a known refcount of 1, such as in error paths
3521  * where the event isn't exposed yet and inherited events.
3522  */
3523 static void free_event(struct perf_event *event)
3524 {
3525         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3526                                 "unexpected event refcount: %ld; ptr=%p\n",
3527                                 atomic_long_read(&event->refcount), event)) {
3528                 /* leak to avoid use-after-free */
3529                 return;
3530         }
3531
3532         _free_event(event);
3533 }
3534
3535 /*
3536  * Remove user event from the owner task.
3537  */
3538 static void perf_remove_from_owner(struct perf_event *event)
3539 {
3540         struct task_struct *owner;
3541
3542         rcu_read_lock();
3543         owner = ACCESS_ONCE(event->owner);
3544         /*
3545          * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3546          * !owner it means the list deletion is complete and we can indeed
3547          * free this event, otherwise we need to serialize on
3548          * owner->perf_event_mutex.
3549          */
3550         smp_read_barrier_depends();
3551         if (owner) {
3552                 /*
3553                  * Since delayed_put_task_struct() also drops the last
3554                  * task reference we can safely take a new reference
3555                  * while holding the rcu_read_lock().
3556                  */
3557                 get_task_struct(owner);
3558         }
3559         rcu_read_unlock();
3560
3561         if (owner) {
3562                 /*
3563                  * If we're here through perf_event_exit_task() we're already
3564                  * holding ctx->mutex which would be an inversion wrt. the
3565                  * normal lock order.
3566                  *
3567                  * However we can safely take this lock because its the child
3568                  * ctx->mutex.
3569                  */
3570                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3571
3572                 /*
3573                  * We have to re-check the event->owner field, if it is cleared
3574                  * we raced with perf_event_exit_task(), acquiring the mutex
3575                  * ensured they're done, and we can proceed with freeing the
3576                  * event.
3577                  */
3578                 if (event->owner)
3579                         list_del_init(&event->owner_entry);
3580                 mutex_unlock(&owner->perf_event_mutex);
3581                 put_task_struct(owner);
3582         }
3583 }
3584
/*
 * Drop one reference on @event. Nothing else happens unless this was the
 * final reference, in which case the event is detached from its owner
 * (non-kernel events only) and from its context, and then freed.
 *
 * Called when the last reference to the file is gone.
 */
static void put_event(struct perf_event *event)
{
	struct perf_event_context *ctx;

	/* Other references remain: the event stays alive. */
	if (!atomic_long_dec_and_test(&event->refcount))
		return;

	/* Events for which is_kernel_event() is false are unlinked from their owner. */
	if (!is_kernel_event(event))
		perf_remove_from_owner(event);

	/*
	 * There are two ways this annotation is useful:
	 *
	 *  1) there is a lock recursion from perf_event_exit_task
	 *     see the comment there.
	 *
	 *  2) there is a lock-inversion with mmap_sem through
	 *     perf_event_read_group(), which takes faults while
	 *     holding ctx->mutex, however this is called after
	 *     the last filedesc died, so there is no possibility
	 *     to trigger the AB-BA case.
	 */
	ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
	WARN_ON_ONCE(ctx->parent_ctx);
	perf_remove_from_context(event, true);
	mutex_unlock(&ctx->mutex);

	/* Removed from the context above; now release the event itself. */
	_free_event(event);
}
3617
/*
 * Exported wrapper around put_event(): drops one reference on @event.
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
	put_event(event);
	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3624
3625 static int perf_release(struct inode *inode, struct file *file)
3626 {
3627         put_event(file->private_data);
3628         return 0;
3629 }
3630
/*
 * Remove all orphaned events from the context.
 *
 * Work handler: @work is the orphans_remove.work member embedded in a
 * struct perf_event_context. Every event on ctx->event_list for which
 * is_orphaned_child() is true is removed from the context, unlinked from
 * its parent's child_list and freed, and a reference on the parent is
 * dropped. A context reference is released with put_ctx() at the end.
 */
static void orphans_remove_work(struct work_struct *work)
{
	struct perf_event_context *ctx;
	struct perf_event *event, *tmp;

	ctx = container_of(work, struct perf_event_context,
			   orphans_remove.work);

	mutex_lock(&ctx->mutex);
	/* _safe variant: the current entry is removed and freed in the body. */
	list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
		/* Read before free_event(event) so the parent can still be put. */
		struct perf_event *parent_event = event->parent;

		if (!is_orphaned_child(event))
			continue;

		perf_remove_from_context(event, true);

		/* child_list membership is changed under the parent's child_mutex. */
		mutex_lock(&parent_event->child_mutex);
		list_del_init(&event->child_list);
		mutex_unlock(&parent_event->child_mutex);

		free_event(event);
		put_event(parent_event);
	}

	/* The "scheduled" flag is cleared under ctx->lock. */
	raw_spin_lock_irq(&ctx->lock);
	ctx->orphans_remove_sched = false;
	raw_spin_unlock_irq(&ctx->lock);
	mutex_unlock(&ctx->mutex);

	put_ctx(ctx);
}
3666
3667 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3668 {
3669         struct perf_event *child;
3670         u64 total = 0;
3671
3672         *enabled = 0;
3673         *running = 0;
3674
3675         mutex_lock(&event->child_mutex);
3676         total += perf_event_read(event);
3677         *enabled += event->total_time_enabled +
3678                         atomic64_read(&event->child_total_time_enabled);
3679         *running += event->total_time_running +
3680                         atomic64_read(&event->child_total_time_running);
3681
3682         list_for_each_entry(child, &event->child_list, child_list) {
3683                 total += perf_event_read(child);
3684                 *enabled += child->total_time_enabled;
3685                 *running += child->total_time_running;
3686         }
3687         mutex_unlock(&event->child_mutex);
3688
3689         return total;
3690 }
3691 EXPORT_SYMBOL_GPL(perf_event_read_value);
3692
3693 static int perf_event_read_group(struct perf_event *event,
3694                                    u64 read_format, char __user *buf)
3695 {
3696         struct perf_event *leader = event->group_leader, *sub;
3697         struct perf_event_context *ctx = leader->ctx;
3698         int n = 0, size = 0, ret;
3699         u64 count, enabled, running;
3700         u64 values[5];
3701
3702         lockdep_assert_held(&ctx->mutex);
3703
3704         count = perf_event_read_value(leader, &enabled, &running);
3705
3706         values[n++] = 1 + leader->nr_siblings;
3707         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3708                 values[n++] = enabled;
3709         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3710                 values[n++] = running;
3711         values[n++] = count;
3712         if (read_format & PERF_FORMAT_ID)
3713                 values[n++] = primary_event_id(leader);
3714
3715         size = n * sizeof(u64);
3716
3717         if (copy_to_user(buf, values, size))
3718                 return -EFAULT;
3719
3720         ret = size;
3721
3722         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3723                 n = 0;
3724
3725                 values[n++] = perf_event_read_value(sub, &enabled, &running);
3726                 if (read_format & PERF_FORMAT_ID)
3727                         values[n++] = primary_event_id(sub);
3728
3729                 size = n * sizeof(u64);
3730
3731                 if (copy_to_user(buf + ret, values, size)) {
3732                         return -EFAULT;
3733                 }
3734
3735                 ret += size;
3736         }
3737
3738         return ret;
3739 }
3740
3741 static int perf_event_read_one(struct perf_event *event,
3742                                  u64 read_format, char __user *buf)
3743 {
3744         u64 enabled, running;
3745         u64 values[4];
3746         int n = 0;
3747
3748         values[n++] = perf_event_read_value(event, &enabled, &running);
3749         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3750                 values[n++] = enabled;
3751         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3752                 values[n++] = running;
3753         if (read_format & PERF_FORMAT_ID)
3754                 values[n++] = primary_event_id(event);
3755
3756         if (copy_to_user(buf, values, n * sizeof(u64)))
3757                 return -EFAULT;
3758
3759         return n * sizeof(u64);
3760 }
3761
3762 static bool is_event_hup(struct perf_event *event)
3763 {
3764         bool no_children;
3765
3766         if (event->state != PERF_EVENT_STATE_EXIT)
3767                 return false;
3768
3769         mutex_lock(&event->child_mutex);
3770         no_children = list_empty(&event->child_list);
3771         mutex_unlock(&event->child_mutex);
3772         return no_children;
3773 }
3774
3775 /*
3776  * Read the performance event - simple non blocking version for now
3777  */
3778 static ssize_t
3779 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3780 {
3781         u64 read_format = event->attr.read_format;
3782         int ret;
3783
3784         /*
3785          * Return end-of-file for a read on a event that is in
3786          * error state (i.e. because it was pinned but it couldn't be
3787          * scheduled on to the CPU at some point).
3788          */
3789         if (event->state == PERF_EVENT_STATE_ERROR)
3790                 return 0;
3791
3792         if (count < event->read_size)
3793                 return -ENOSPC;
3794
3795         WARN_ON_ONCE(event->ctx->parent_ctx);
3796         if (read_format & PERF_FORMAT_GROUP)
3797                 ret = perf_event_read_group(event, read_format, buf);
3798         else
3799                 ret = perf_event_read_one(event, read_format, buf);
3800
3801         return ret;
3802 }
3803
3804 static ssize_t
3805 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3806 {
3807         struct perf_event *event = file->private_data;
3808         struct perf_event_context *ctx;
3809         int ret;
3810
3811         ctx = perf_event_ctx_lock(event);
3812         ret = perf_read_hw(event, buf, count);
3813         perf_event_ctx_unlock(event, ctx);
3814
3815         return ret;
3816 }
3817
3818 static unsigned int perf_poll(struct file *file, poll_table *wait)
3819 {
3820         struct perf_event *event = file->private_data;
3821         struct ring_buffer *rb;
3822         unsigned int events = POLLHUP;
3823
3824         poll_wait(file, &event->waitq, wait);
3825
3826         if (is_event_hup(event))
3827                 return events;
3828
3829         /*
3830          * Pin the event->rb by taking event->mmap_mutex; otherwise
3831          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3832          */
3833         mutex_lock(&event->mmap_mutex);
3834         rb = event->rb;
3835         if (rb)
3836                 events = atomic_xchg(&rb->poll, 0);
3837         mutex_unlock(&event->mmap_mutex);
3838         return events;
3839 }
3840
/*
 * Reset @event's count to zero: the event is read first (the value is
 * discarded), event->count is then set to 0, and the user page is
 * refreshed so it reflects the new value.
 */
static void _perf_event_reset(struct perf_event *event)
{
	(void)perf_event_read(event);
	local64_set(&event->count, 0);
	perf_event_update_userpage(event);
}
3847
3848 /*
3849  * Holding the top-level event's child_mutex means that any
3850  * descendant process that has inherited this event will block
3851  * in sync_child_event if it goes to exit, thus satisfying the
3852  * task existence requirements of perf_event_enable/disable.
3853  */
3854 static void perf_event_for_each_child(struct perf_event *event,
3855                                         void (*func)(struct perf_event *))
3856 {
3857         struct perf_event *child;
3858
3859         WARN_ON_ONCE(event->ctx->parent_ctx);
3860
3861         mutex_lock(&event->child_mutex);
3862         func(event);
3863         list_for_each_entry(child, &event->child_list, child_list)
3864                 func(child);
3865         mutex_unlock(&event->child_mutex);
3866 }
3867
3868 static void perf_event_for_each(struct perf_event *event,
3869                                   void (*func)(struct perf_event *))
3870 {
3871         struct perf_event_context *ctx = event->ctx;
3872         struct perf_event *sibling;
3873
3874         lockdep_assert_held(&ctx->mutex);
3875
3876         event = event->group_leader;
3877
3878         perf_event_for_each_child(event, func);
3879         list_for_each_entry(sibling, &event->sibling_list, group_entry)
3880                 perf_event_for_each_child(sibling, func);
3881 }
3882
/*
 * Update the sample period (or, for attr.freq events, the sample
 * frequency) of @event from the u64 that @arg points to in user space.
 *
 * Returns 0 on success; -EINVAL if the event is not a sampling event, the
 * value is zero, or a requested frequency exceeds
 * sysctl_perf_event_sample_rate; -EFAULT if the value cannot be copied
 * from user space.
 */
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
	struct perf_event_context *ctx = event->ctx;
	int ret = 0, active;
	u64 value;

	if (!is_sampling_event(event))
		return -EINVAL;

	if (copy_from_user(&value, arg, sizeof(value)))
		return -EFAULT;

	if (!value)
		return -EINVAL;

	raw_spin_lock_irq(&ctx->lock);
	if (event->attr.freq) {
		/* In frequency mode the value is bounded by the sysctl limit. */
		if (value > sysctl_perf_event_sample_rate) {
			ret = -EINVAL;
			goto unlock;
		}

		event->attr.sample_freq = value;
	} else {
		event->attr.sample_period = value;
		event->hw.sample_period = value;
	}

	/* An active event is stopped around the period_left reset below. */
	active = (event->state == PERF_EVENT_STATE_ACTIVE);
	if (active) {
		perf_pmu_disable(ctx->pmu);
		event->pmu->stop(event, PERF_EF_UPDATE);
	}

	local64_set(&event->hw.period_left, 0);

	/* Restart with PERF_EF_RELOAD if it was stopped above. */
	if (active) {
		event->pmu->start(event, PERF_EF_RELOAD);
		perf_pmu_enable(ctx->pmu);
	}

unlock:
	raw_spin_unlock_irq(&ctx->lock);

	return ret;
}
3929
3930 static const struct file_operations perf_fops;
3931
3932 static inline int perf_fget_light(int fd, struct fd *p)
3933 {
3934         struct fd f = fdget(fd);
3935         if (!f.file)
3936                 return -EBADF;
3937
3938         if (f.file->f_op != &perf_fops) {
3939                 fdput(f);
3940                 return -EBADF;
3941         }
3942         *p = f;
3943         return 0;
3944 }
3945
3946 static int perf_event_set_output(struct perf_event *event,
3947                                  struct perf_event *output_event);
3948 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3949
3950 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3951 {
3952         void (*func)(struct perf_event *);
3953         u32 flags = arg;
3954
3955         switch (cmd) {
3956         case PERF_EVENT_IOC_ENABLE:
3957                 func = _perf_event_enable;
3958                 break;
3959         case PERF_EVENT_IOC_DISABLE:
3960                 func = _perf_event_disable;
3961                 break;
3962         case PERF_EVENT_IOC_RESET:
3963                 func = _perf_event_reset;
3964                 break;
3965
3966         case PERF_EVENT_IOC_REFRESH:
3967                 return _perf_event_refresh(event, arg);
3968
3969         case PERF_EVENT_IOC_PERIOD:
3970                 return perf_event_period(event, (u64 __user *)arg);
3971
3972         case PERF_EVENT_IOC_ID:
3973         {
3974                 u64 id = primary_event_id(event);
3975
3976                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3977                         return -EFAULT;
3978                 return 0;
3979         }
3980
3981         case PERF_EVENT_IOC_SET_OUTPUT:
3982         {
3983                 int ret;
3984                 if (arg != -1) {
3985                         struct perf_event *output_event;
3986                         struct fd output;
3987                         ret = perf_fget_light(arg, &output);
3988                         if (ret)
3989                                 return ret;
3990                         output_event = output.file->private_data;
3991                         ret = perf_event_set_output(event, output_event);
3992                         fdput(output);
3993                 } else {
3994                         ret = perf_event_set_output(event, NULL);
3995                 }
3996                 return ret;
3997         }
3998
3999         case PERF_EVENT_IOC_SET_FILTER:
4000                 return perf_event_set_filter(event, (void __user *)arg);
4001
4002         default:
4003                 return -ENOTTY;
4004         }
4005
4006         if (flags & PERF_IOC_FLAG_GROUP)
4007                 perf_event_for_each(event, func);
4008         else
4009                 perf_event_for_each_child(event, func);
4010
4011         return 0;
4012 }
4013
4014 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4015 {
4016         struct perf_event *event = file->private_data;
4017         struct perf_event_context *ctx;
4018         long ret;
4019
4020         ctx = perf_event_ctx_lock(event);
4021         ret = _perf_ioctl(event, cmd, arg);
4022         perf_event_ctx_unlock(event, ctx);
4023
4024         return ret;
4025 }
4026
#ifdef CONFIG_COMPAT
/*
 * ->compat_ioctl() handler of perf_fops.
 *
 * For SET_FILTER and ID, whose size field encodes a pointer size, a
 * command carrying sizeof(compat_uptr_t) is rewritten to carry
 * sizeof(void *) so that it matches the native command number. Everything
 * is then forwarded to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
				unsigned long arg)
{
	switch (_IOC_NR(cmd)) {
	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
	case _IOC_NR(PERF_EVENT_IOC_ID):
		/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
			cmd &= ~IOCSIZE_MASK;
			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
		}
		break;
	}
	return perf_ioctl(file, cmd, arg);
}
#else
/* Without CONFIG_COMPAT no compat handler is installed. */
# define perf_compat_ioctl NULL
#endif
4046
4047 int perf_event_task_enable(void)
4048 {
4049         struct perf_event_context *ctx;
4050         struct perf_event *event;
4051
4052         mutex_lock(&current->perf_event_mutex);
4053         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4054                 ctx = perf_event_ctx_lock(event);
4055                 perf_event_for_each_child(event, _perf_event_enable);
4056                 perf_event_ctx_unlock(event, ctx);
4057         }
4058         mutex_unlock(&current->perf_event_mutex);
4059
4060         return 0;
4061 }
4062
4063 int perf_event_task_disable(void)
4064 {
4065         struct perf_event_context *ctx;
4066         struct perf_event *event;
4067
4068         mutex_lock(&current->perf_event_mutex);
4069         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4070                 ctx = perf_event_ctx_lock(event);
4071                 perf_event_for_each_child(event, _perf_event_disable);
4072                 perf_event_ctx_unlock(event, ctx);
4073         }
4074         mutex_unlock(&current->perf_event_mutex);
4075
4076         return 0;
4077 }
4078
4079 static int perf_event_index(struct perf_event *event)
4080 {
4081         if (event->hw.state & PERF_HES_STOPPED)
4082                 return 0;
4083
4084         if (event->state != PERF_EVENT_STATE_ACTIVE)
4085                 return 0;
4086
4087         return event->pmu->event_idx(event);
4088 }
4089
4090 static void calc_timer_values(struct perf_event *event,
4091                                 u64 *now,
4092                                 u64 *enabled,
4093                                 u64 *running)
4094 {
4095         u64 ctx_time;
4096
4097         *now = perf_clock();
4098         ctx_time = event->shadow_ctx_time + *now;
4099         *enabled = ctx_time - event->tstamp_enabled;
4100         *running = ctx_time - event->tstamp_running;
4101 }
4102
4103 static void perf_event_init_userpage(struct perf_event *event)
4104 {
4105         struct perf_event_mmap_page *userpg;
4106         struct ring_buffer *rb;
4107
4108         rcu_read_lock();
4109         rb = rcu_dereference(event->rb);
4110         if (!rb)
4111                 goto unlock;
4112
4113         userpg = rb->user_page;
4114
4115         /* Allow new userspace to detect that bit 0 is deprecated */
4116         userpg->cap_bit0_is_deprecated = 1;
4117         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4118
4119 unlock:
4120         rcu_read_unlock();
4121 }
4122
/*
 * Weak default that does nothing. It is called from
 * perf_event_update_userpage() with the event, its user page and the
 * timestamp used for the update; being __weak, another definition can
 * replace it at link time.
 */
void __weak arch_perf_update_userpage(
	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}
4127
/*
 * Publish @event's current index, offset and enabled/running times in the
 * user page of its ring buffer. Does nothing if no ring buffer is
 * attached.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
	struct perf_event_mmap_page *userpg;
	struct ring_buffer *rb;
	u64 enabled, running, now;

	rcu_read_lock();
	rb = rcu_dereference(event->rb);
	if (!rb)
		goto unlock;

	/*
	 * Compute total_time_enabled and total_time_running
	 * based on snapshot values taken when the event
	 * was last scheduled in.
	 *
	 * We cannot simply call update_context_time()
	 * because of locking issues, as we can be called in
	 * NMI context.
	 */
	calc_timer_values(event, &now, &enabled, &running);

	userpg = rb->user_page;
	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
	/* userpg->lock is bumped once before and once after the update. */
	++userpg->lock;
	barrier();
	userpg->index = perf_event_index(event);
	userpg->offset = perf_event_count(event);
	/* With a non-zero index, hw.prev_count is subtracted from the offset. */
	if (userpg->index)
		userpg->offset -= local64_read(&event->hw.prev_count);

	userpg->time_enabled = enabled +
			atomic64_read(&event->child_total_time_enabled);

	userpg->time_running = running +
			atomic64_read(&event->child_total_time_running);

	arch_perf_update_userpage(event, userpg, now);

	barrier();
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}
4182
4183 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4184 {
4185         struct perf_event *event = vma->vm_file->private_data;
4186         struct ring_buffer *rb;
4187         int ret = VM_FAULT_SIGBUS;
4188
4189         if (vmf->flags & FAULT_FLAG_MKWRITE) {
4190                 if (vmf->pgoff == 0)
4191                         ret = 0;
4192                 return ret;
4193         }
4194
4195         rcu_read_lock();
4196         rb = rcu_dereference(event->rb);
4197         if (!rb)
4198                 goto unlock;
4199
4200         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4201                 goto unlock;
4202
4203         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4204         if (!vmf->page)
4205                 goto unlock;
4206
4207         get_page(vmf->page);
4208         vmf->page->mapping = vma->vm_file->f_mapping;
4209         vmf->page->index   = vmf->pgoff;
4210
4211         ret = 0;
4212 unlock:
4213         rcu_read_unlock();
4214
4215         return ret;
4216 }
4217
/*
 * Make @rb the ring buffer of @event, detaching the event from its
 * current buffer (if any) first. @rb may be NULL to only detach.
 *
 * The reference the event held on the old buffer is dropped and waiters
 * on event->waitq are woken after a detach.
 */
static void ring_buffer_attach(struct perf_event *event,
			       struct ring_buffer *rb)
{
	struct ring_buffer *old_rb = NULL;
	unsigned long flags;

	if (event->rb) {
		/*
		 * Should be impossible, we set this when removing
		 * event->rb_entry and wait/clear when adding event->rb_entry.
		 */
		WARN_ON_ONCE(event->rcu_pending);

		old_rb = event->rb;
		/* Snapshot RCU state so a later attach can wait for it if needed. */
		event->rcu_batches = get_state_synchronize_rcu();
		event->rcu_pending = 1;

		spin_lock_irqsave(&old_rb->event_lock, flags);
		list_del_rcu(&event->rb_entry);
		spin_unlock_irqrestore(&old_rb->event_lock, flags);
	}

	/* Before re-adding rb_entry, wait out the RCU state recorded at removal. */
	if (event->rcu_pending && rb) {
		cond_synchronize_rcu(event->rcu_batches);
		event->rcu_pending = 0;
	}

	if (rb) {
		spin_lock_irqsave(&rb->event_lock, flags);
		list_add_rcu(&event->rb_entry, &rb->event_list);
		spin_unlock_irqrestore(&rb->event_lock, flags);
	}

	rcu_assign_pointer(event->rb, rb);

	if (old_rb) {
		ring_buffer_put(old_rb);
		/*
		 * Since we detached before setting the new rb, so that we
		 * could attach the new rb, we could have missed a wakeup.
		 * Provide it now.
		 */
		wake_up_all(&event->waitq);
	}
}
4263
4264 static void ring_buffer_wakeup(struct perf_event *event)
4265 {
4266         struct ring_buffer *rb;
4267
4268         rcu_read_lock();
4269         rb = rcu_dereference(event->rb);
4270         if (rb) {
4271                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4272                         wake_up_all(&event->waitq);
4273         }
4274         rcu_read_unlock();
4275 }
4276
4277 static void rb_free_rcu(struct rcu_head *rcu_head)
4278 {
4279         struct ring_buffer *rb;
4280
4281         rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4282         rb_free(rb);
4283 }
4284
4285 static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4286 {
4287         struct ring_buffer *rb;
4288
4289         rcu_read_lock();
4290         rb = rcu_dereference(event->rb);
4291         if (rb) {
4292                 if (!atomic_inc_not_zero(&rb->refcount))
4293                         rb = NULL;
4294         }
4295         rcu_read_unlock();
4296
4297         return rb;
4298 }
4299
4300 static void ring_buffer_put(struct ring_buffer *rb)
4301 {
4302         if (!atomic_dec_and_test(&rb->refcount))
4303                 return;
4304
4305         WARN_ON_ONCE(!list_empty(&rb->event_list));
4306
4307         call_rcu(&rb->rcu_head, rb_free_rcu);
4308 }
4309
/*
 * ->open() handler of perf_mmap_vmops: accounts one more mapping on both
 * the event and its ring buffer, then notifies the PMU through the
 * optional ->event_mapped() callback.
 */
static void perf_mmap_open(struct vm_area_struct *vma)
{
	struct perf_event *event = vma->vm_file->private_data;

	atomic_inc(&event->mmap_count);
	atomic_inc(&event->rb->mmap_count);

	if (event->pmu->event_mapped)
		event->pmu->event_mapped(event);
}
4320
/*
 * ->close() handler of perf_mmap_vmops.
 *
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
	struct perf_event *event = vma->vm_file->private_data;

	/*
	 * Hold our own reference on the buffer for the duration of this
	 * function, and snapshot the accounting data recorded by perf_mmap().
	 */
	struct ring_buffer *rb = ring_buffer_get(event);
	struct user_struct *mmap_user = rb->mmap_user;
	int mmap_locked = rb->mmap_locked;
	unsigned long size = perf_data_size(rb);

	if (event->pmu->event_unmapped)
		event->pmu->event_unmapped(event);

	atomic_dec(&rb->mmap_count);

	/* Only the event's last unmap proceeds; it does so holding mmap_mutex. */
	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
		goto out_put;

	ring_buffer_attach(event, NULL);
	mutex_unlock(&event->mmap_mutex);

	/* If there's still other mmap()s of this buffer, we're done. */
	if (atomic_read(&rb->mmap_count))
		goto out_put;

	/*
	 * No other mmap()s, detach from all other events that might redirect
	 * into the now unreachable buffer. Somewhat complicated by the
	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
	 */
again:
	rcu_read_lock();
	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
		if (!atomic_long_inc_not_zero(&event->refcount)) {
			/*
			 * This event is en-route to free_event() which will
			 * detach it and remove it from the list.
			 */
			continue;
		}
		/* We now hold a reference on the event; leave the RCU section to sleep. */
		rcu_read_unlock();

		mutex_lock(&event->mmap_mutex);
		/*
		 * Check we didn't race with perf_event_set_output() which can
		 * swizzle the rb from under us while we were waiting to
		 * acquire mmap_mutex.
		 *
		 * If we find a different rb; ignore this event, a next
		 * iteration will no longer find it on the list. We have to
		 * still restart the iteration to make sure we're not now
		 * iterating the wrong list.
		 */
		if (event->rb == rb)
			ring_buffer_attach(event, NULL);

		mutex_unlock(&event->mmap_mutex);
		put_event(event);

		/*
		 * Restart the iteration; either we're on the wrong list or
		 * destroyed its integrity by doing a deletion.
		 */
		goto again;
	}
	rcu_read_unlock();

	/*
	 * It could be there's still a few 0-ref events on the list; they'll
	 * get cleaned up by free_event() -- they'll also still have their
	 * ref on the rb and will free it whenever they are done with it.
	 *
	 * Aside from that, this buffer is 'fully' detached and unmapped,
	 * undo the VM accounting.
	 */

	/* The "+ 1" mirrors the extra page accounted by perf_mmap() (nr_pages + 1). */
	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
	vma->vm_mm->pinned_vm -= mmap_locked;
	free_uid(mmap_user);

out_put:
	ring_buffer_put(rb); /* could be last */
}
4411
/*
 * VM operations installed on perf mmap()s by perf_mmap(). The same
 * handler serves both regular faults and page_mkwrite.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
	.open		= perf_mmap_open,
	.close		= perf_mmap_close,
	.fault		= perf_mmap_fault,
	.page_mkwrite	= perf_mmap_fault,
};
4418
4419 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4420 {
4421         struct perf_event *event = file->private_data;
4422         unsigned long user_locked, user_lock_limit;
4423         struct user_struct *user = current_user();
4424         unsigned long locked, lock_limit;
4425         struct ring_buffer *rb;
4426         unsigned long vma_size;
4427         unsigned long nr_pages;
4428         long user_extra, extra;
4429         int ret = 0, flags = 0;
4430
4431         /*
4432          * Don't allow mmap() of inherited per-task counters. This would
4433          * create a performance issue due to all children writing to the
4434          * same rb.
4435          */
4436         if (event->cpu == -1 && event->attr.inherit)
4437                 return -EINVAL;
4438
4439         if (!(vma->vm_flags & VM_SHARED))
4440                 return -EINVAL;
4441
4442         vma_size = vma->vm_end - vma->vm_start;
4443         nr_pages = (vma_size / PAGE_SIZE) - 1;
4444
4445         /*
4446          * If we have rb pages ensure they're a power-of-two number, so we
4447          * can do bitmasks instead of modulo.
4448          */
4449         if (!is_power_of_2(nr_pages))
4450                 return -EINVAL;
4451
4452         if (vma_size != PAGE_SIZE * (1 + nr_pages))
4453                 return -EINVAL;
4454
4455         if (vma->vm_pgoff != 0)
4456                 return -EINVAL;
4457
4458         WARN_ON_ONCE(event->ctx->parent_ctx);
4459 again:
4460         mutex_lock(&event->mmap_mutex);
4461         if (event->rb) {
4462                 if (event->rb->nr_pages != nr_pages) {
4463                         ret = -EINVAL;
4464                         goto unlock;
4465                 }
4466
4467                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4468                         /*
4469                          * Raced against perf_mmap_close() through
4470                          * perf_event_set_output(). Try again, hope for better
4471                          * luck.
4472                          */
4473                         mutex_unlock(&event->mmap_mutex);
4474                         goto again;
4475                 }
4476
4477                 goto unlock;
4478         }
4479
4480         user_extra = nr_pages + 1;
4481         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4482
4483         /*
4484          * Increase the limit linearly with more CPUs:
4485          */
4486         user_lock_limit *= num_online_cpus();
4487
4488         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4489
4490         extra = 0;
4491         if (user_locked > user_lock_limit)
4492                 extra = user_locked - user_lock_limit;
4493
4494         lock_limit = rlimit(RLIMIT_MEMLOCK);
4495         lock_limit >>= PAGE_SHIFT;
4496         locked = vma->vm_mm->pinned_vm + extra;
4497
4498         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4499                 !capable(CAP_IPC_LOCK)) {
4500                 ret = -EPERM;
4501                 goto unlock;
4502         }
4503
4504         WARN_ON(event->rb);
4505
4506         if (vma->vm_flags & VM_WRITE)
4507                 flags |= RING_BUFFER_WRITABLE;
4508
4509         rb = rb_alloc(nr_pages,
4510                 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4511                 event->cpu, flags);
4512
4513         if (!rb) {
4514                 ret = -ENOMEM;
4515                 goto unlock;
4516         }
4517
4518         atomic_set(&rb->mmap_count, 1);
4519         rb->mmap_locked = extra;
4520         rb->mmap_user = get_current_user();
4521
4522         atomic_long_add(user_extra, &user->locked_vm);
4523         vma->vm_mm->pinned_vm += extra;
4524
4525         ring_buffer_attach(event, rb);
4526
4527         perf_event_init_userpage(event);
4528         perf_event_update_userpage(event);
4529
4530 unlock:
4531         if (!ret)
4532                 atomic_inc(&event->mmap_count);
4533         mutex_unlock(&event->mmap_mutex);
4534
4535         /*
4536          * Since pinned accounting is per vm we cannot allow fork() to copy our
4537          * vma.
4538          */
4539         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4540         vma->vm_ops = &perf_mmap_vmops;
4541
4542         if (event->pmu->event_mapped)
4543                 event->pmu->event_mapped(event);
4544
4545         return ret;
4546 }
4547
4548 static int perf_fasync(int fd, struct file *filp, int on)
4549 {
4550         struct inode *inode = file_inode(filp);
4551         struct perf_event *event = filp->private_data;
4552         int retval;
4553
4554         mutex_lock(&inode->i_mutex);
4555         retval = fasync_helper(fd, filp, on, &event->fasync);
4556         mutex_unlock(&inode->i_mutex);
4557
4558         if (retval < 0)
4559                 return retval;
4560
4561         return 0;
4562 }
4563
4564 static const struct file_operations perf_fops = {
4565         .llseek                 = no_llseek,
4566         .release                = perf_release,
4567         .read                   = perf_read,
4568         .poll                   = perf_poll,
4569         .unlocked_ioctl         = perf_ioctl,
4570         .compat_ioctl           = perf_compat_ioctl,
4571         .mmap                   = perf_mmap,
4572         .fasync                 = perf_fasync,
4573 };
4574
4575 /*
4576  * Perf event wakeup
4577  *
4578  * If there's data, ensure we set the poll() state and publish everything
4579  * to user-space before waking everybody up.
4580  */
4581
4582 void perf_event_wakeup(struct perf_event *event)
4583 {
4584         ring_buffer_wakeup(event);
4585
4586         if (event->pending_kill) {
4587                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4588                 event->pending_kill = 0;
4589         }
4590 }
4591
4592 static void perf_pending_event(struct irq_work *entry)
4593 {
4594         struct perf_event *event = container_of(entry,
4595                         struct perf_event, pending);
4596
4597         if (event->pending_disable) {
4598                 event->pending_disable = 0;
4599                 __perf_event_disable(event);
4600         }
4601
4602         if (event->pending_wakeup) {
4603                 event->pending_wakeup = 0;
4604                 perf_event_wakeup(event);
4605         }
4606 }
4607
/*
 * Guest-info callbacks currently registered with perf.
 *
 * We assume there is only KVM supporting the callbacks, so a single
 * pointer is enough.  Later on, we might change it to a list if there
 * is another virtualization implementation supporting the callbacks.
 */
struct perf_guest_info_callbacks *perf_guest_cbs;
4614
4615 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4616 {
4617         perf_guest_cbs = cbs;
4618         return 0;
4619 }
4620 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4621
4622 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4623 {
4624         perf_guest_cbs = NULL;
4625         return 0;
4626 }
4627 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4628
4629 static void
4630 perf_output_sample_regs(struct perf_output_handle *handle,
4631                         struct pt_regs *regs, u64 mask)
4632 {
4633         int bit;
4634
4635         for_each_set_bit(bit, (const unsigned long *) &mask,
4636                          sizeof(mask) * BITS_PER_BYTE) {
4637                 u64 val;
4638
4639                 val = perf_reg_value(regs, bit);
4640                 perf_output_put(handle, val);
4641         }
4642 }
4643
4644 static void perf_sample_regs_user(struct perf_regs *regs_user,
4645                                   struct pt_regs *regs,
4646                                   struct pt_regs *regs_user_copy)
4647 {
4648         if (user_mode(regs)) {
4649                 regs_user->abi = perf_reg_abi(current);
4650                 regs_user->regs = regs;
4651         } else if (current->mm) {
4652                 perf_get_regs_user(regs_user, regs, regs_user_copy);
4653         } else {
4654                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4655                 regs_user->regs = NULL;
4656         }
4657 }
4658
4659 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4660                                   struct pt_regs *regs)
4661 {
4662         regs_intr->regs = regs;
4663         regs_intr->abi  = perf_reg_abi(current);
4664 }
4665
4666
4667 /*
4668  * Get remaining task size from user stack pointer.
4669  *
4670  * It'd be better to take stack vma map and limit this more
4671  * precisly, but there's no way to get it safely under interrupt,
4672  * so using TASK_SIZE as limit.
4673  */
4674 static u64 perf_ustack_task_size(struct pt_regs *regs)
4675 {
4676         unsigned long addr = perf_user_stack_pointer(regs);
4677
4678         if (!addr || addr >= TASK_SIZE)
4679                 return 0;
4680
4681         return TASK_SIZE - addr;
4682 }
4683
4684 static u16
4685 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4686                         struct pt_regs *regs)
4687 {
4688         u64 task_size;
4689
4690         /* No regs, no stack pointer, no dump. */
4691         if (!regs)
4692                 return 0;
4693
4694         /*
4695          * Check if we fit in with the requested stack size into the:
4696          * - TASK_SIZE
4697          *   If we don't, we limit the size to the TASK_SIZE.
4698          *
4699          * - remaining sample size
4700          *   If we don't, we customize the stack size to
4701          *   fit in to the remaining sample size.
4702          */
4703
4704         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4705         stack_size = min(stack_size, (u16) task_size);
4706
4707         /* Current header size plus static size and dynamic size. */
4708         header_size += 2 * sizeof(u64);
4709
4710         /* Do we fit in with the current stack dump size? */
4711         if ((u16) (header_size + stack_size) < header_size) {
4712                 /*
4713                  * If we overflow the maximum size for the sample,
4714                  * we customize the stack dump size to fit in.
4715                  */
4716                 stack_size = USHRT_MAX - header_size - sizeof(u64);
4717                 stack_size = round_up(stack_size, sizeof(u64));
4718         }
4719
4720         return stack_size;
4721 }
4722
4723 static void
4724 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4725                           struct pt_regs *regs)
4726 {
4727         /* Case of a kernel thread, nothing to dump */
4728         if (!regs) {
4729                 u64 size = 0;
4730                 perf_output_put(handle, size);
4731         } else {
4732                 unsigned long sp;
4733                 unsigned int rem;
4734                 u64 dyn_size;
4735
4736                 /*
4737                  * We dump:
4738                  * static size
4739                  *   - the size requested by user or the best one we can fit
4740                  *     in to the sample max size
4741                  * data
4742                  *   - user stack dump data
4743                  * dynamic size
4744                  *   - the actual dumped size
4745                  */
4746
4747                 /* Static size. */
4748                 perf_output_put(handle, dump_size);
4749
4750                 /* Data. */
4751                 sp = perf_user_stack_pointer(regs);
4752                 rem = __output_copy_user(handle, (void *) sp, dump_size);
4753                 dyn_size = dump_size - rem;
4754
4755                 perf_output_skip(handle, rem);
4756
4757                 /* Dynamic size. */
4758                 perf_output_put(handle, dyn_size);
4759         }
4760 }
4761
4762 static void __perf_event_header__init_id(struct perf_event_header *header,
4763                                          struct perf_sample_data *data,
4764                                          struct perf_event *event)
4765 {
4766         u64 sample_type = event->attr.sample_type;
4767
4768         data->type = sample_type;
4769         header->size += event->id_header_size;
4770
4771         if (sample_type & PERF_SAMPLE_TID) {
4772                 /* namespace issues */
4773                 data->tid_entry.pid = perf_event_pid(event, current);
4774                 data->tid_entry.tid = perf_event_tid(event, current);
4775         }
4776
4777         if (sample_type & PERF_SAMPLE_TIME)
4778                 data->time = perf_clock();
4779
4780         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4781                 data->id = primary_event_id(event);
4782
4783         if (sample_type & PERF_SAMPLE_STREAM_ID)
4784                 data->stream_id = event->id;
4785
4786         if (sample_type & PERF_SAMPLE_CPU) {
4787                 data->cpu_entry.cpu      = raw_smp_processor_id();
4788                 data->cpu_entry.reserved = 0;
4789         }
4790 }
4791
4792 void perf_event_header__init_id(struct perf_event_header *header,
4793                                 struct perf_sample_data *data,
4794                                 struct perf_event *event)
4795 {
4796         if (event->attr.sample_id_all)
4797                 __perf_event_header__init_id(header, data, event);
4798 }
4799
4800 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4801                                            struct perf_sample_data *data)
4802 {
4803         u64 sample_type = data->type;
4804
4805         if (sample_type & PERF_SAMPLE_TID)
4806                 perf_output_put(handle, data->tid_entry);
4807
4808         if (sample_type & PERF_SAMPLE_TIME)
4809                 perf_output_put(handle, data->time);
4810
4811         if (sample_type & PERF_SAMPLE_ID)
4812                 perf_output_put(handle, data->id);
4813
4814         if (sample_type & PERF_SAMPLE_STREAM_ID)
4815                 perf_output_put(handle, data->stream_id);
4816
4817         if (sample_type & PERF_SAMPLE_CPU)
4818                 perf_output_put(handle, data->cpu_entry);
4819
4820         if (sample_type & PERF_SAMPLE_IDENTIFIER)
4821                 perf_output_put(handle, data->id);
4822 }
4823
4824 void perf_event__output_id_sample(struct perf_event *event,
4825                                   struct perf_output_handle *handle,
4826                                   struct perf_sample_data *sample)
4827 {
4828         if (event->attr.sample_id_all)
4829                 __perf_event__output_id_sample(handle, sample);
4830 }
4831
4832 static void perf_output_read_one(struct perf_output_handle *handle,
4833                                  struct perf_event *event,
4834                                  u64 enabled, u64 running)
4835 {
4836         u64 read_format = event->attr.read_format;
4837         u64 values[4];
4838         int n = 0;
4839
4840         values[n++] = perf_event_count(event);
4841         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4842                 values[n++] = enabled +
4843                         atomic64_read(&event->child_total_time_enabled);
4844         }
4845         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4846                 values[n++] = running +
4847                         atomic64_read(&event->child_total_time_running);
4848         }
4849         if (read_format & PERF_FORMAT_ID)
4850                 values[n++] = primary_event_id(event);
4851
4852         __output_copy(handle, values, n * sizeof(u64));
4853 }
4854
4855 /*
4856  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4857  */
4858 static void perf_output_read_group(struct perf_output_handle *handle,
4859                             struct perf_event *event,
4860                             u64 enabled, u64 running)
4861 {
4862         struct perf_event *leader = event->group_leader, *sub;
4863         u64 read_format = event->attr.read_format;
4864         u64 values[5];
4865         int n = 0;
4866
4867         values[n++] = 1 + leader->nr_siblings;
4868
4869         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4870                 values[n++] = enabled;
4871
4872         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4873                 values[n++] = running;
4874
4875         if (leader != event)
4876                 leader->pmu->read(leader);
4877
4878         values[n++] = perf_event_count(leader);
4879         if (read_format & PERF_FORMAT_ID)
4880                 values[n++] = primary_event_id(leader);
4881
4882         __output_copy(handle, values, n * sizeof(u64));
4883
4884         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4885                 n = 0;
4886
4887                 if ((sub != event) &&
4888                     (sub->state == PERF_EVENT_STATE_ACTIVE))
4889                         sub->pmu->read(sub);
4890
4891                 values[n++] = perf_event_count(sub);
4892                 if (read_format & PERF_FORMAT_ID)
4893                         values[n++] = primary_event_id(sub);
4894
4895                 __output_copy(handle, values, n * sizeof(u64));
4896         }
4897 }
4898
4899 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4900                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
4901
4902 static void perf_output_read(struct perf_output_handle *handle,
4903                              struct perf_event *event)
4904 {
4905         u64 enabled = 0, running = 0, now;
4906         u64 read_format = event->attr.read_format;
4907
4908         /*
4909          * compute total_time_enabled, total_time_running
4910          * based on snapshot values taken when the event
4911          * was last scheduled in.
4912          *
4913          * we cannot simply called update_context_time()
4914          * because of locking issue as we are called in
4915          * NMI context
4916          */
4917         if (read_format & PERF_FORMAT_TOTAL_TIMES)
4918                 calc_timer_values(event, &now, &enabled, &running);
4919
4920         if (event->attr.read_format & PERF_FORMAT_GROUP)
4921                 perf_output_read_group(handle, event, enabled, running);
4922         else
4923                 perf_output_read_one(handle, event, enabled, running);
4924 }
4925
4926 void perf_output_sample(struct perf_output_handle *handle,
4927                         struct perf_event_header *header,
4928                         struct perf_sample_data *data,
4929                         struct perf_event *event)
4930 {
4931         u64 sample_type = data->type;
4932
4933         perf_output_put(handle, *header);
4934
4935         if (sample_type & PERF_SAMPLE_IDENTIFIER)
4936                 perf_output_put(handle, data->id);
4937
4938         if (sample_type & PERF_SAMPLE_IP)
4939                 perf_output_put(handle, data->ip);
4940
4941         if (sample_type & PERF_SAMPLE_TID)
4942                 perf_output_put(handle, data->tid_entry);
4943
4944         if (sample_type & PERF_SAMPLE_TIME)
4945                 perf_output_put(handle, data->time);
4946
4947         if (sample_type & PERF_SAMPLE_ADDR)
4948                 perf_output_put(handle, data->addr);
4949
4950         if (sample_type & PERF_SAMPLE_ID)
4951                 perf_output_put(handle, data->id);
4952
4953         if (sample_type & PERF_SAMPLE_STREAM_ID)
4954                 perf_output_put(handle, data->stream_id);
4955
4956         if (sample_type & PERF_SAMPLE_CPU)
4957                 perf_output_put(handle, data->cpu_entry);
4958
4959         if (sample_type & PERF_SAMPLE_PERIOD)
4960                 perf_output_put(handle, data->period);
4961
4962         if (sample_type & PERF_SAMPLE_READ)
4963                 perf_output_read(handle, event);
4964
4965         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4966                 if (data->callchain) {
4967                         int size = 1;
4968
4969                         if (data->callchain)
4970                                 size += data->callchain->nr;
4971
4972                         size *= sizeof(u64);
4973
4974                         __output_copy(handle, data->callchain, size);
4975                 } else {
4976                         u64 nr = 0;
4977                         perf_output_put(handle, nr);
4978                 }
4979         }
4980
4981         if (sample_type & PERF_SAMPLE_RAW) {
4982                 if (data->raw) {
4983                         perf_output_put(handle, data->raw->size);
4984                         __output_copy(handle, data->raw->data,
4985                                            data->raw->size);
4986                 } else {
4987                         struct {
4988                                 u32     size;
4989                                 u32     data;
4990                         } raw = {
4991                                 .size = sizeof(u32),
4992                                 .data = 0,
4993                         };
4994                         perf_output_put(handle, raw);
4995                 }
4996         }
4997
4998         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4999                 if (data->br_stack) {
5000                         size_t size;
5001
5002                         size = data->br_stack->nr
5003                              * sizeof(struct perf_branch_entry);
5004
5005                         perf_output_put(handle, data->br_stack->nr);
5006                         perf_output_copy(handle, data->br_stack->entries, size);
5007                 } else {
5008                         /*
5009                          * we always store at least the value of nr
5010                          */
5011                         u64 nr = 0;
5012                         perf_output_put(handle, nr);
5013                 }
5014         }
5015
5016         if (sample_type & PERF_SAMPLE_REGS_USER) {
5017                 u64 abi = data->regs_user.abi;
5018
5019                 /*
5020                  * If there are no regs to dump, notice it through
5021                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5022                  */
5023                 perf_output_put(handle, abi);
5024
5025                 if (abi) {
5026                         u64 mask = event->attr.sample_regs_user;
5027                         perf_output_sample_regs(handle,
5028                                                 data->regs_user.regs,
5029                                                 mask);
5030                 }
5031         }
5032
5033         if (sample_type & PERF_SAMPLE_STACK_USER) {
5034                 perf_output_sample_ustack(handle,
5035                                           data->stack_user_size,
5036                                           data->regs_user.regs);
5037         }
5038
5039         if (sample_type & PERF_SAMPLE_WEIGHT)
5040                 perf_output_put(handle, data->weight);
5041
5042         if (sample_type & PERF_SAMPLE_DATA_SRC)
5043                 perf_output_put(handle, data->data_src.val);
5044
5045         if (sample_type & PERF_SAMPLE_TRANSACTION)
5046                 perf_output_put(handle, data->txn);
5047
5048         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5049                 u64 abi = data->regs_intr.abi;
5050                 /*
5051                  * If there are no regs to dump, notice it through
5052                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5053                  */
5054                 perf_output_put(handle, abi);
5055
5056                 if (abi) {
5057                         u64 mask = event->attr.sample_regs_intr;
5058
5059                         perf_output_sample_regs(handle,
5060                                                 data->regs_intr.regs,
5061                                                 mask);
5062                 }
5063         }
5064
5065         if (!event->attr.watermark) {
5066                 int wakeup_events = event->attr.wakeup_events;
5067
5068                 if (wakeup_events) {
5069                         struct ring_buffer *rb = handle->rb;
5070                         int events = local_inc_return(&rb->events);
5071
5072                         if (events >= wakeup_events) {
5073                                 local_sub(wakeup_events, &rb->events);
5074                                 local_inc(&rb->wakeup);
5075                         }
5076                 }
5077         }
5078 }
5079
5080 void perf_prepare_sample(struct perf_event_header *header,
5081                          struct perf_sample_data *data,
5082                          struct perf_event *event,
5083                          struct pt_regs *regs)
5084 {
5085         u64 sample_type = event->attr.sample_type;
5086
5087         header->type = PERF_RECORD_SAMPLE;
5088         header->size = sizeof(*header) + event->header_size;
5089
5090         header->misc = 0;
5091         header->misc |= perf_misc_flags(regs);
5092
5093         __perf_event_header__init_id(header, data, event);
5094
5095         if (sample_type & PERF_SAMPLE_IP)
5096                 data->ip = perf_instruction_pointer(regs);
5097
5098         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5099                 int size = 1;
5100
5101                 data->callchain = perf_callchain(event, regs);
5102
5103                 if (data->callchain)
5104                         size += data->callchain->nr;
5105
5106                 header->size += size * sizeof(u64);
5107         }
5108
5109         if (sample_type & PERF_SAMPLE_RAW) {
5110                 int size = sizeof(u32);
5111
5112                 if (data->raw)
5113                         size += data->raw->size;
5114                 else
5115                         size += sizeof(u32);
5116
5117                 WARN_ON_ONCE(size & (sizeof(u64)-1));
5118                 header->size += size;
5119         }
5120
5121         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5122                 int size = sizeof(u64); /* nr */
5123                 if (data->br_stack) {
5124                         size += data->br_stack->nr
5125                               * sizeof(struct perf_branch_entry);
5126                 }
5127                 header->size += size;
5128         }
5129
5130         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5131                 perf_sample_regs_user(&data->regs_user, regs,
5132                                       &data->regs_user_copy);
5133
5134         if (sample_type & PERF_SAMPLE_REGS_USER) {
5135                 /* regs dump ABI info */
5136                 int size = sizeof(u64);
5137
5138                 if (data->regs_user.regs) {
5139                         u64 mask = event->attr.sample_regs_user;
5140                         size += hweight64(mask) * sizeof(u64);
5141                 }
5142
5143                 header->size += size;
5144         }
5145
5146         if (sample_type & PERF_SAMPLE_STACK_USER) {
5147                 /*
5148                  * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5149                  * processed as the last one or have additional check added
5150                  * in case new sample type is added, because we could eat
5151                  * up the rest of the sample size.
5152                  */
5153                 u16 stack_size = event->attr.sample_stack_user;
5154                 u16 size = sizeof(u64);
5155
5156                 stack_size = perf_sample_ustack_size(stack_size, header->size,
5157                                                      data->regs_user.regs);
5158
5159                 /*
5160                  * If there is something to dump, add space for the dump
5161                  * itself and for the field that tells the dynamic size,
5162                  * which is how many have been actually dumped.
5163                  */
5164                 if (stack_size)
5165                         size += sizeof(u64) + stack_size;
5166
5167                 data->stack_user_size = stack_size;
5168                 header->size += size;
5169         }
5170
5171         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5172                 /* regs dump ABI info */
5173                 int size = sizeof(u64);
5174
5175                 perf_sample_regs_intr(&data->regs_intr, regs);
5176
5177                 if (data->regs_intr.regs) {
5178                         u64 mask = event->attr.sample_regs_intr;
5179
5180                         size += hweight64(mask) * sizeof(u64);
5181                 }
5182
5183                 header->size += size;
5184         }
5185 }
5186
5187 static void perf_event_output(struct perf_event *event,
5188                                 struct perf_sample_data *data,
5189                                 struct pt_regs *regs)
5190 {
5191         struct perf_output_handle handle;
5192         struct perf_event_header header;
5193
5194         /* protect the callchain buffers */
5195         rcu_read_lock();
5196
5197         perf_prepare_sample(&header, data, event, regs);
5198
5199         if (perf_output_begin(&handle, event, header.size))
5200                 goto exit;
5201
5202         perf_output_sample(&handle, &header, data, event);
5203
5204         perf_output_end(&handle);
5205
5206 exit:
5207         rcu_read_unlock();
5208 }
5209
5210 /*
5211  * read event_id
5212  */
5213
5214 struct perf_read_event {
5215         struct perf_event_header        header;
5216
5217         u32                             pid;
5218         u32                             tid;
5219 };
5220
5221 static void
5222 perf_event_read_event(struct perf_event *event,
5223                         struct task_struct *task)
5224 {
5225         struct perf_output_handle handle;
5226         struct perf_sample_data sample;
5227         struct perf_read_event read_event = {
5228                 .header = {
5229                         .type = PERF_RECORD_READ,
5230                         .misc = 0,
5231                         .size = sizeof(read_event) + event->read_size,
5232                 },
5233                 .pid = perf_event_pid(event, task),
5234                 .tid = perf_event_tid(event, task),
5235         };
5236         int ret;
5237
5238         perf_event_header__init_id(&read_event.header, &sample, event);
5239         ret = perf_output_begin(&handle, event, read_event.header.size);
5240         if (ret)
5241                 return;
5242
5243         perf_output_put(&handle, read_event);
5244         perf_output_read(&handle, event);
5245         perf_event__output_id_sample(event, &handle, &sample);
5246
5247         perf_output_end(&handle);
5248 }
5249
/* Per-event output callback: called with an event and the caller's opaque data. */
typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5251
5252 static void
5253 perf_event_aux_ctx(struct perf_event_context *ctx,
5254                    perf_event_aux_output_cb output,
5255                    void *data)
5256 {
5257         struct perf_event *event;
5258
5259         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5260                 if (event->state < PERF_EVENT_STATE_INACTIVE)
5261                         continue;
5262                 if (!event_filter_match(event))
5263                         continue;
5264                 output(event, data);
5265         }
5266 }
5267
5268 static void
5269 perf_event_aux(perf_event_aux_output_cb output, void *data,
5270                struct perf_event_context *task_ctx)
5271 {
5272         struct perf_cpu_context *cpuctx;
5273         struct perf_event_context *ctx;
5274         struct pmu *pmu;
5275         int ctxn;
5276
5277         rcu_read_lock();
5278         list_for_each_entry_rcu(pmu, &pmus, entry) {
5279                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5280                 if (cpuctx->unique_pmu != pmu)
5281                         goto next;
5282                 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5283                 if (task_ctx)
5284                         goto next;
5285                 ctxn = pmu->task_ctx_nr;
5286                 if (ctxn < 0)
5287                         goto next;
5288                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5289                 if (ctx)
5290                         perf_event_aux_ctx(ctx, output, data);
5291 next:
5292                 put_cpu_ptr(pmu->pmu_cpu_context);
5293         }
5294
5295         if (task_ctx) {
5296                 preempt_disable();
5297                 perf_event_aux_ctx(task_ctx, output, data);
5298                 preempt_enable();
5299         }
5300         rcu_read_unlock();
5301 }
5302
5303 /*
5304  * task tracking -- fork/exit
5305  *
5306  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5307  */
5308
5309 struct perf_task_event {
5310         struct task_struct              *task;
5311         struct perf_event_context       *task_ctx;
5312
5313         struct {
5314                 struct perf_event_header        header;
5315
5316                 u32                             pid;
5317                 u32                             ppid;
5318                 u32                             tid;
5319                 u32                             ptid;
5320                 u64                             time;
5321         } event_id;
5322 };
5323
5324 static int perf_event_task_match(struct perf_event *event)
5325 {
5326         return event->attr.comm  || event->attr.mmap ||
5327                event->attr.mmap2 || event->attr.mmap_data ||
5328                event->attr.task;
5329 }
5330
5331 static void perf_event_task_output(struct perf_event *event,
5332                                    void *data)
5333 {
5334         struct perf_task_event *task_event = data;
5335         struct perf_output_handle handle;
5336         struct perf_sample_data sample;
5337         struct task_struct *task = task_event->task;
5338         int ret, size = task_event->event_id.header.size;
5339
5340         if (!perf_event_task_match(event))
5341                 return;
5342
5343         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5344
5345         ret = perf_output_begin(&handle, event,
5346                                 task_event->event_id.header.size);
5347         if (ret)
5348                 goto out;
5349
5350         task_event->event_id.pid = perf_event_pid(event, task);
5351         task_event->event_id.ppid = perf_event_pid(event, current);
5352
5353         task_event->event_id.tid = perf_event_tid(event, task);
5354         task_event->event_id.ptid = perf_event_tid(event, current);
5355
5356         perf_output_put(&handle, task_event->event_id);
5357
5358         perf_event__output_id_sample(event, &handle, &sample);
5359
5360         perf_output_end(&handle);
5361 out:
5362         task_event->event_id.header.size = size;
5363 }
5364
5365 static void perf_event_task(struct task_struct *task,
5366                               struct perf_event_context *task_ctx,
5367                               int new)
5368 {
5369         struct perf_task_event task_event;
5370
5371         if (!atomic_read(&nr_comm_events) &&
5372             !atomic_read(&nr_mmap_events) &&
5373             !atomic_read(&nr_task_events))
5374                 return;
5375
5376         task_event = (struct perf_task_event){
5377                 .task     = task,
5378                 .task_ctx = task_ctx,
5379                 .event_id    = {
5380                         .header = {
5381                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5382                                 .misc = 0,
5383                                 .size = sizeof(task_event.event_id),
5384                         },
5385                         /* .pid  */
5386                         /* .ppid */
5387                         /* .tid  */
5388                         /* .ptid */
5389                         .time = perf_clock(),
5390                 },
5391         };
5392
5393         perf_event_aux(perf_event_task_output,
5394                        &task_event,
5395                        task_ctx);
5396 }
5397
5398 void perf_event_fork(struct task_struct *task)
5399 {
5400         perf_event_task(task, NULL, 1);
5401 }
5402
5403 /*
5404  * comm tracking
5405  */
5406
5407 struct perf_comm_event {
5408         struct task_struct      *task;
5409         char                    *comm;
5410         int                     comm_size;
5411
5412         struct {
5413                 struct perf_event_header        header;
5414
5415                 u32                             pid;
5416                 u32                             tid;
5417         } event_id;
5418 };
5419
5420 static int perf_event_comm_match(struct perf_event *event)
5421 {
5422         return event->attr.comm;
5423 }
5424
5425 static void perf_event_comm_output(struct perf_event *event,
5426                                    void *data)
5427 {
5428         struct perf_comm_event *comm_event = data;
5429         struct perf_output_handle handle;
5430         struct perf_sample_data sample;
5431         int size = comm_event->event_id.header.size;
5432         int ret;
5433
5434         if (!perf_event_comm_match(event))
5435                 return;
5436
5437         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5438         ret = perf_output_begin(&handle, event,
5439                                 comm_event->event_id.header.size);
5440
5441         if (ret)
5442                 goto out;
5443
5444         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5445         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5446
5447         perf_output_put(&handle, comm_event->event_id);
5448         __output_copy(&handle, comm_event->comm,
5449                                    comm_event->comm_size);
5450
5451         perf_event__output_id_sample(event, &handle, &sample);
5452
5453         perf_output_end(&handle);
5454 out:
5455         comm_event->event_id.header.size = size;
5456 }
5457
5458 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5459 {
5460         char comm[TASK_COMM_LEN];
5461         unsigned int size;
5462
5463         memset(comm, 0, sizeof(comm));
5464         strlcpy(comm, comm_event->task->comm, sizeof(comm));
5465         size = ALIGN(strlen(comm)+1, sizeof(u64));
5466
5467         comm_event->comm = comm;
5468         comm_event->comm_size = size;
5469
5470         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5471
5472         perf_event_aux(perf_event_comm_output,
5473                        comm_event,
5474                        NULL);
5475 }
5476
5477 void perf_event_comm(struct task_struct *task, bool exec)
5478 {
5479         struct perf_comm_event comm_event;
5480
5481         if (!atomic_read(&nr_comm_events))
5482                 return;
5483
5484         comm_event = (struct perf_comm_event){
5485                 .task   = task,
5486                 /* .comm      */
5487                 /* .comm_size */
5488                 .event_id  = {
5489                         .header = {
5490                                 .type = PERF_RECORD_COMM,
5491                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5492                                 /* .size */
5493                         },
5494                         /* .pid */
5495                         /* .tid */
5496                 },
5497         };
5498
5499         perf_event_comm_event(&comm_event);
5500 }
5501
5502 /*
5503  * mmap tracking
5504  */
5505
5506 struct perf_mmap_event {
5507         struct vm_area_struct   *vma;
5508
5509         const char              *file_name;
5510         int                     file_size;
5511         int                     maj, min;
5512         u64                     ino;
5513         u64                     ino_generation;
5514         u32                     prot, flags;
5515
5516         struct {
5517                 struct perf_event_header        header;
5518
5519                 u32                             pid;
5520                 u32                             tid;
5521                 u64                             start;
5522                 u64                             len;
5523                 u64                             pgoff;
5524         } event_id;
5525 };
5526
5527 static int perf_event_mmap_match(struct perf_event *event,
5528                                  void *data)
5529 {
5530         struct perf_mmap_event *mmap_event = data;
5531         struct vm_area_struct *vma = mmap_event->vma;
5532         int executable = vma->vm_flags & VM_EXEC;
5533
5534         return (!executable && event->attr.mmap_data) ||
5535                (executable && (event->attr.mmap || event->attr.mmap2));
5536 }
5537
5538 static void perf_event_mmap_output(struct perf_event *event,
5539                                    void *data)
5540 {
5541         struct perf_mmap_event *mmap_event = data;
5542         struct perf_output_handle handle;
5543         struct perf_sample_data sample;
5544         int size = mmap_event->event_id.header.size;
5545         int ret;
5546
5547         if (!perf_event_mmap_match(event, data))
5548                 return;
5549
5550         if (event->attr.mmap2) {
5551                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5552                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5553                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5554                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5555                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5556                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5557                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5558         }
5559
5560         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5561         ret = perf_output_begin(&handle, event,
5562                                 mmap_event->event_id.header.size);
5563         if (ret)
5564                 goto out;
5565
5566         mmap_event->event_id.pid = perf_event_pid(event, current);
5567         mmap_event->event_id.tid = perf_event_tid(event, current);
5568
5569         perf_output_put(&handle, mmap_event->event_id);
5570
5571         if (event->attr.mmap2) {
5572                 perf_output_put(&handle, mmap_event->maj);
5573                 perf_output_put(&handle, mmap_event->min);
5574                 perf_output_put(&handle, mmap_event->ino);
5575                 perf_output_put(&handle, mmap_event->ino_generation);
5576                 perf_output_put(&handle, mmap_event->prot);
5577                 perf_output_put(&handle, mmap_event->flags);
5578         }
5579
5580         __output_copy(&handle, mmap_event->file_name,
5581                                    mmap_event->file_size);
5582
5583         perf_event__output_id_sample(event, &handle, &sample);
5584
5585         perf_output_end(&handle);
5586 out:
5587         mmap_event->event_id.header.size = size;
5588 }
5589
5590 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5591 {
5592         struct vm_area_struct *vma = mmap_event->vma;
5593         struct file *file = vma->vm_file;
5594         int maj = 0, min = 0;
5595         u64 ino = 0, gen = 0;
5596         u32 prot = 0, flags = 0;
5597         unsigned int size;
5598         char tmp[16];
5599         char *buf = NULL;
5600         char *name;
5601
5602         if (file) {
5603                 struct inode *inode;
5604                 dev_t dev;
5605
5606                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5607                 if (!buf) {
5608                         name = "//enomem";
5609                         goto cpy_name;
5610                 }
5611                 /*
5612                  * d_path() works from the end of the rb backwards, so we
5613                  * need to add enough zero bytes after the string to handle
5614                  * the 64bit alignment we do later.
5615                  */
5616                 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5617                 if (IS_ERR(name)) {
5618                         name = "//toolong";
5619                         goto cpy_name;
5620                 }
5621                 inode = file_inode(vma->vm_file);
5622                 dev = inode->i_sb->s_dev;
5623                 ino = inode->i_ino;
5624                 gen = inode->i_generation;
5625                 maj = MAJOR(dev);
5626                 min = MINOR(dev);
5627
5628                 if (vma->vm_flags & VM_READ)
5629                         prot |= PROT_READ;
5630                 if (vma->vm_flags & VM_WRITE)
5631                         prot |= PROT_WRITE;
5632                 if (vma->vm_flags & VM_EXEC)
5633                         prot |= PROT_EXEC;
5634
5635                 if (vma->vm_flags & VM_MAYSHARE)
5636                         flags = MAP_SHARED;
5637                 else
5638                         flags = MAP_PRIVATE;
5639
5640                 if (vma->vm_flags & VM_DENYWRITE)
5641                         flags |= MAP_DENYWRITE;
5642                 if (vma->vm_flags & VM_MAYEXEC)
5643                         flags |= MAP_EXECUTABLE;
5644                 if (vma->vm_flags & VM_LOCKED)
5645                         flags |= MAP_LOCKED;
5646                 if (vma->vm_flags & VM_HUGETLB)
5647                         flags |= MAP_HUGETLB;
5648
5649                 goto got_name;
5650         } else {
5651                 if (vma->vm_ops && vma->vm_ops->name) {
5652                         name = (char *) vma->vm_ops->name(vma);
5653                         if (name)
5654                                 goto cpy_name;
5655                 }
5656
5657                 name = (char *)arch_vma_name(vma);
5658                 if (name)
5659                         goto cpy_name;
5660
5661                 if (vma->vm_start <= vma->vm_mm->start_brk &&
5662                                 vma->vm_end >= vma->vm_mm->brk) {
5663                         name = "[heap]";
5664                         goto cpy_name;
5665                 }
5666                 if (vma->vm_start <= vma->vm_mm->start_stack &&
5667                                 vma->vm_end >= vma->vm_mm->start_stack) {
5668                         name = "[stack]";
5669                         goto cpy_name;
5670                 }
5671
5672                 name = "//anon";
5673                 goto cpy_name;
5674         }
5675
5676 cpy_name:
5677         strlcpy(tmp, name, sizeof(tmp));
5678         name = tmp;
5679 got_name:
5680         /*
5681          * Since our buffer works in 8 byte units we need to align our string
5682          * size to a multiple of 8. However, we must guarantee the tail end is
5683          * zero'd out to avoid leaking random bits to userspace.
5684          */
5685         size = strlen(name)+1;
5686         while (!IS_ALIGNED(size, sizeof(u64)))
5687                 name[size++] = '\0';
5688
5689         mmap_event->file_name = name;
5690         mmap_event->file_size = size;
5691         mmap_event->maj = maj;
5692         mmap_event->min = min;
5693         mmap_event->ino = ino;
5694         mmap_event->ino_generation = gen;
5695         mmap_event->prot = prot;
5696         mmap_event->flags = flags;
5697
5698         if (!(vma->vm_flags & VM_EXEC))
5699                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5700
5701         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5702
5703         perf_event_aux(perf_event_mmap_output,
5704                        mmap_event,
5705                        NULL);
5706
5707         kfree(buf);
5708 }
5709
5710 void perf_event_mmap(struct vm_area_struct *vma)
5711 {
5712         struct perf_mmap_event mmap_event;
5713
5714         if (!atomic_read(&nr_mmap_events))
5715                 return;
5716
5717         mmap_event = (struct perf_mmap_event){
5718                 .vma    = vma,
5719                 /* .file_name */
5720                 /* .file_size */
5721                 .event_id  = {
5722                         .header = {
5723                                 .type = PERF_RECORD_MMAP,
5724                                 .misc = PERF_RECORD_MISC_USER,
5725                                 /* .size */
5726                         },
5727                         /* .pid */
5728                         /* .tid */
5729                         .start  = vma->vm_start,
5730                         .len    = vma->vm_end - vma->vm_start,
5731                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
5732                 },
5733                 /* .maj (attr_mmap2 only) */
5734                 /* .min (attr_mmap2 only) */
5735                 /* .ino (attr_mmap2 only) */
5736                 /* .ino_generation (attr_mmap2 only) */
5737                 /* .prot (attr_mmap2 only) */
5738                 /* .flags (attr_mmap2 only) */
5739         };
5740
5741         perf_event_mmap_event(&mmap_event);
5742 }
5743
5744 /*
5745  * IRQ throttle logging
5746  */
5747
5748 static void perf_log_throttle(struct perf_event *event, int enable)
5749 {
5750         struct perf_output_handle handle;
5751         struct perf_sample_data sample;
5752         int ret;
5753
5754         struct {
5755                 struct perf_event_header        header;
5756                 u64                             time;
5757                 u64                             id;
5758                 u64                             stream_id;
5759         } throttle_event = {
5760                 .header = {
5761                         .type = PERF_RECORD_THROTTLE,
5762                         .misc = 0,
5763                         .size = sizeof(throttle_event),
5764                 },
5765                 .time           = perf_clock(),
5766                 .id             = primary_event_id(event),
5767                 .stream_id      = event->id,
5768         };
5769
5770         if (enable)
5771                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5772
5773         perf_event_header__init_id(&throttle_event.header, &sample, event);
5774
5775         ret = perf_output_begin(&handle, event,
5776                                 throttle_event.header.size);
5777         if (ret)
5778                 return;
5779
5780         perf_output_put(&handle, throttle_event);
5781         perf_event__output_id_sample(event, &handle, &sample);
5782         perf_output_end(&handle);
5783 }
5784
5785 /*
5786  * Generic event overflow handling, sampling.
5787  */
5788
5789 static int __perf_event_overflow(struct perf_event *event,
5790                                    int throttle, struct perf_sample_data *data,
5791                                    struct pt_regs *regs)
5792 {
5793         int events = atomic_read(&event->event_limit);
5794         struct hw_perf_event *hwc = &event->hw;
5795         u64 seq;
5796         int ret = 0;
5797
5798         /*
5799          * Non-sampling counters might still use the PMI to fold short
5800          * hardware counters, ignore those.
5801          */
5802         if (unlikely(!is_sampling_event(event)))
5803                 return 0;
5804
5805         seq = __this_cpu_read(perf_throttled_seq);
5806         if (seq != hwc->interrupts_seq) {
5807                 hwc->interrupts_seq = seq;
5808                 hwc->interrupts = 1;
5809         } else {
5810                 hwc->interrupts++;
5811                 if (unlikely(throttle
5812                              && hwc->interrupts >= max_samples_per_tick)) {
5813                         __this_cpu_inc(perf_throttled_count);
5814                         hwc->interrupts = MAX_INTERRUPTS;
5815                         perf_log_throttle(event, 0);
5816                         tick_nohz_full_kick();
5817                         ret = 1;
5818                 }
5819         }
5820
5821         if (event->attr.freq) {
5822                 u64 now = perf_clock();
5823                 s64 delta = now - hwc->freq_time_stamp;
5824
5825                 hwc->freq_time_stamp = now;
5826
5827                 if (delta > 0 && delta < 2*TICK_NSEC)
5828                         perf_adjust_period(event, delta, hwc->last_period, true);
5829         }
5830
5831         /*
5832          * XXX event_limit might not quite work as expected on inherited
5833          * events
5834          */
5835
5836         event->pending_kill = POLL_IN;
5837         if (events && atomic_dec_and_test(&event->event_limit)) {
5838                 ret = 1;
5839                 event->pending_kill = POLL_HUP;
5840                 event->pending_disable = 1;
5841                 irq_work_queue(&event->pending);
5842         }
5843
5844         if (event->overflow_handler)
5845                 event->overflow_handler(event, data, regs);
5846         else
5847                 perf_event_output(event, data, regs);
5848
5849         if (event->fasync && event->pending_kill) {
5850                 event->pending_wakeup = 1;
5851                 irq_work_queue(&event->pending);
5852         }
5853
5854         return ret;
5855 }
5856
/*
 * Overflow entry point with throttling always enabled; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, throttle, data, regs);
}
5863
5864 /*
5865  * Generic software event infrastructure
5866  */
5867
5868 struct swevent_htable {
5869         struct swevent_hlist            *swevent_hlist;
5870         struct mutex                    hlist_mutex;
5871         int                             hlist_refcount;
5872
5873         /* Recursion avoidance in each contexts */
5874         int                             recursion[PERF_NR_CONTEXTS];
5875
5876         /* Keeps track of cpu being initialized/exited */
5877         bool                            online;
5878 };
5879
5880 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5881
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. That period counter
 * is kept in the range [-sample_period, 0] so that we can use its
 * sign as the trigger.
 */
5888
5889 u64 perf_swevent_set_period(struct perf_event *event)
5890 {
5891         struct hw_perf_event *hwc = &event->hw;
5892         u64 period = hwc->last_period;
5893         u64 nr, offset;
5894         s64 old, val;
5895
5896         hwc->last_period = hwc->sample_period;
5897
5898 again:
5899         old = val = local64_read(&hwc->period_left);
5900         if (val < 0)
5901                 return 0;
5902
5903         nr = div64_u64(period + val, period);
5904         offset = nr * period;
5905         val -= offset;
5906         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
5907                 goto again;
5908
5909         return nr;
5910 }
5911
5912 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5913                                     struct perf_sample_data *data,
5914                                     struct pt_regs *regs)
5915 {
5916         struct hw_perf_event *hwc = &event->hw;
5917         int throttle = 0;
5918
5919         if (!overflow)
5920                 overflow = perf_swevent_set_period(event);
5921
5922         if (hwc->interrupts == MAX_INTERRUPTS)
5923                 return;
5924
5925         for (; overflow; overflow--) {
5926                 if (__perf_event_overflow(event, throttle,
5927                                             data, regs)) {
5928                         /*
5929                          * We inhibit the overflow from happening when
5930                          * hwc->interrupts == MAX_INTERRUPTS.
5931                          */
5932                         break;
5933                 }
5934                 throttle = 1;
5935         }
5936 }
5937
5938 static void perf_swevent_event(struct perf_event *event, u64 nr,
5939                                struct perf_sample_data *data,
5940                                struct pt_regs *regs)
5941 {
5942         struct hw_perf_event *hwc = &event->hw;
5943
5944         local64_add(nr, &event->count);
5945
5946         if (!regs)
5947                 return;
5948
5949         if (!is_sampling_event(event))
5950                 return;
5951
5952         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5953                 data->period = nr;
5954                 return perf_swevent_overflow(event, 1, data, regs);
5955         } else
5956                 data->period = event->hw.last_period;
5957
5958         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5959                 return perf_swevent_overflow(event, 1, data, regs);
5960
5961         if (local64_add_negative(nr, &hwc->period_left))
5962                 return;
5963
5964         perf_swevent_overflow(event, 0, data, regs);
5965 }
5966
5967 static int perf_exclude_event(struct perf_event *event,
5968                               struct pt_regs *regs)
5969 {
5970         if (event->hw.state & PERF_HES_STOPPED)
5971                 return 1;
5972
5973         if (regs) {
5974                 if (event->attr.exclude_user && user_mode(regs))
5975                         return 1;
5976
5977                 if (event->attr.exclude_kernel && !user_mode(regs))
5978                         return 1;
5979         }
5980
5981         return 0;
5982 }
5983
5984 static int perf_swevent_match(struct perf_event *event,
5985                                 enum perf_type_id type,
5986                                 u32 event_id,
5987                                 struct perf_sample_data *data,
5988                                 struct pt_regs *regs)
5989 {
5990         if (event->attr.type != type)
5991                 return 0;
5992
5993         if (event->attr.config != event_id)
5994                 return 0;
5995
5996         if (perf_exclude_event(event, regs))
5997                 return 0;
5998
5999         return 1;
6000 }
6001
6002 static inline u64 swevent_hash(u64 type, u32 event_id)
6003 {
6004         u64 val = event_id | (type << 32);
6005
6006         return hash_64(val, SWEVENT_HLIST_BITS);
6007 }
6008
6009 static inline struct hlist_head *
6010 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6011 {
6012         u64 hash = swevent_hash(type, event_id);
6013
6014         return &hlist->heads[hash];
6015 }
6016
6017 /* For the read side: events when they trigger */
6018 static inline struct hlist_head *
6019 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6020 {
6021         struct swevent_hlist *hlist;
6022
6023         hlist = rcu_dereference(swhash->swevent_hlist);
6024         if (!hlist)
6025                 return NULL;
6026
6027         return __find_swevent_head(hlist, type, event_id);
6028 }
6029
6030 /* For the event head insertion and removal in the hlist */
6031 static inline struct hlist_head *
6032 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6033 {
6034         struct swevent_hlist *hlist;
6035         u32 event_id = event->attr.config;
6036         u64 type = event->attr.type;
6037
6038         /*
6039          * Event scheduling is always serialized against hlist allocation
6040          * and release. Which makes the protected version suitable here.
6041          * The context lock guarantees that.
6042          */
6043         hlist = rcu_dereference_protected(swhash->swevent_hlist,
6044                                           lockdep_is_held(&event->ctx->lock));
6045         if (!hlist)
6046                 return NULL;
6047
6048         return __find_swevent_head(hlist, type, event_id);
6049 }
6050
6051 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6052                                     u64 nr,
6053                                     struct perf_sample_data *data,
6054                                     struct pt_regs *regs)
6055 {
6056         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6057         struct perf_event *event;
6058         struct hlist_head *head;
6059
6060         rcu_read_lock();
6061         head = find_swevent_head_rcu(swhash, type, event_id);
6062         if (!head)
6063                 goto end;
6064
6065         hlist_for_each_entry_rcu(event, head, hlist_entry) {
6066                 if (perf_swevent_match(event, type, event_id, data, regs))
6067                         perf_swevent_event(event, nr, data, regs);
6068         }
6069 end:
6070         rcu_read_unlock();
6071 }
6072
6073 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6074
6075 int perf_swevent_get_recursion_context(void)
6076 {
6077         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6078
6079         return get_recursion_context(swhash->recursion);
6080 }
6081 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6082
6083 inline void perf_swevent_put_recursion_context(int rctx)
6084 {
6085         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6086
6087         put_recursion_context(swhash->recursion, rctx);
6088 }
6089
6090 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6091 {
6092         struct perf_sample_data data;
6093
6094         if (WARN_ON_ONCE(!regs))
6095                 return;
6096
6097         perf_sample_data_init(&data, addr, 0);
6098         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6099 }
6100
6101 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6102 {
6103         int rctx;
6104
6105         preempt_disable_notrace();
6106         rctx = perf_swevent_get_recursion_context();
6107         if (unlikely(rctx < 0))
6108                 goto fail;
6109
6110         ___perf_sw_event(event_id, nr, regs, addr);
6111
6112         perf_swevent_put_recursion_context(rctx);
6113 fail:
6114         preempt_enable_notrace();
6115 }
6116
/*
 * pmu ->read() callback.  Intentionally empty: perf_swevent_event() adds
 * straight into event->count, so there is nothing to fold in here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
6120
6121 static int perf_swevent_add(struct perf_event *event, int flags)
6122 {
6123         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6124         struct hw_perf_event *hwc = &event->hw;
6125         struct hlist_head *head;
6126
6127         if (is_sampling_event(event)) {
6128                 hwc->last_period = hwc->sample_period;
6129                 perf_swevent_set_period(event);
6130         }
6131
6132         hwc->state = !(flags & PERF_EF_START);
6133
6134         head = find_swevent_head(swhash, event);
6135         if (!head) {
6136                 /*
6137                  * We can race with cpu hotplug code. Do not
6138                  * WARN if the cpu just got unplugged.
6139                  */
6140                 WARN_ON_ONCE(swhash->online);
6141                 return -EINVAL;
6142         }
6143
6144         hlist_add_head_rcu(&event->hlist_entry, head);
6145         perf_event_update_userpage(event);
6146
6147         return 0;
6148 }
6149
6150 static void perf_swevent_del(struct perf_event *event, int flags)
6151 {
6152         hlist_del_rcu(&event->hlist_entry);
6153 }
6154
6155 static void perf_swevent_start(struct perf_event *event, int flags)
6156 {
6157         event->hw.state = 0;
6158 }
6159
6160 static void perf_swevent_stop(struct perf_event *event, int flags)
6161 {
6162         event->hw.state = PERF_HES_STOPPED;
6163 }
6164
6165 /* Deref the hlist from the update side */
6166 static inline struct swevent_hlist *
6167 swevent_hlist_deref(struct swevent_htable *swhash)
6168 {
6169         return rcu_dereference_protected(swhash->swevent_hlist,
6170                                          lockdep_is_held(&swhash->hlist_mutex));
6171 }
6172
6173 static void swevent_hlist_release(struct swevent_htable *swhash)
6174 {
6175         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6176
6177         if (!hlist)
6178                 return;
6179
6180         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6181         kfree_rcu(hlist, rcu_head);
6182 }
6183
6184 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6185 {
6186         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6187
6188         mutex_lock(&swhash->hlist_mutex);
6189
6190         if (!--swhash->hlist_refcount)
6191                 swevent_hlist_release(swhash);
6192
6193         mutex_unlock(&swhash->hlist_mutex);
6194 }
6195
6196 static void swevent_hlist_put(struct perf_event *event)
6197 {
6198         int cpu;
6199
6200         for_each_possible_cpu(cpu)
6201                 swevent_hlist_put_cpu(event, cpu);
6202 }
6203
6204 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6205 {
6206         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6207         int err = 0;
6208
6209         mutex_lock(&swhash->hlist_mutex);
6210
6211         if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6212                 struct swevent_hlist *hlist;
6213
6214                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6215                 if (!hlist) {
6216                         err = -ENOMEM;
6217                         goto exit;
6218                 }
6219                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6220         }
6221         swhash->hlist_refcount++;
6222 exit:
6223         mutex_unlock(&swhash->hlist_mutex);
6224
6225         return err;
6226 }
6227
6228 static int swevent_hlist_get(struct perf_event *event)
6229 {
6230         int err;
6231         int cpu, failed_cpu;
6232
6233         get_online_cpus();
6234         for_each_possible_cpu(cpu) {
6235                 err = swevent_hlist_get_cpu(event, cpu);
6236                 if (err) {
6237                         failed_cpu = cpu;
6238                         goto fail;
6239                 }
6240         }
6241         put_online_cpus();
6242
6243         return 0;
6244 fail:
6245         for_each_possible_cpu(cpu) {
6246                 if (cpu == failed_cpu)
6247                         break;
6248                 swevent_hlist_put_cpu(event, cpu);
6249         }
6250
6251         put_online_cpus();
6252         return err;
6253 }
6254
6255 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6256
6257 static void sw_perf_event_destroy(struct perf_event *event)
6258 {
6259         u64 event_id = event->attr.config;
6260
6261         WARN_ON(event->parent);
6262
6263         static_key_slow_dec(&perf_swevent_enabled[event_id]);
6264         swevent_hlist_put(event);
6265 }
6266
6267 static int perf_swevent_init(struct perf_event *event)
6268 {
6269         u64 event_id = event->attr.config;
6270
6271         if (event->attr.type != PERF_TYPE_SOFTWARE)
6272                 return -ENOENT;
6273
6274         /*
6275          * no branch sampling for software events
6276          */
6277         if (has_branch_stack(event))
6278                 return -EOPNOTSUPP;
6279
6280         switch (event_id) {
6281         case PERF_COUNT_SW_CPU_CLOCK:
6282         case PERF_COUNT_SW_TASK_CLOCK:
6283                 return -ENOENT;
6284
6285         default:
6286                 break;
6287         }
6288
6289         if (event_id >= PERF_COUNT_SW_MAX)
6290                 return -ENOENT;
6291
6292         if (!event->parent) {
6293                 int err;
6294
6295                 err = swevent_hlist_get(event);
6296                 if (err)
6297                         return err;
6298
6299                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6300                 event->destroy = sw_perf_event_destroy;
6301         }
6302
6303         return 0;
6304 }
6305
6306 static struct pmu perf_swevent = {
6307         .task_ctx_nr    = perf_sw_context,
6308
6309         .event_init     = perf_swevent_init,
6310         .add            = perf_swevent_add,
6311         .del            = perf_swevent_del,
6312         .start          = perf_swevent_start,
6313         .stop           = perf_swevent_stop,
6314         .read           = perf_swevent_read,
6315 };
6316
6317 #ifdef CONFIG_EVENT_TRACING
6318
6319 static int perf_tp_filter_match(struct perf_event *event,
6320                                 struct perf_sample_data *data)
6321 {
6322         void *record = data->raw->data;
6323
6324         if (likely(!event->filter) || filter_match_preds(event->filter, record))
6325                 return 1;
6326         return 0;
6327 }
6328
6329 static int perf_tp_event_match(struct perf_event *event,
6330                                 struct perf_sample_data *data,
6331                                 struct pt_regs *regs)
6332 {
6333         if (event->hw.state & PERF_HES_STOPPED)
6334                 return 0;
6335         /*
6336          * All tracepoints are from kernel-space.
6337          */
6338         if (event->attr.exclude_kernel)
6339                 return 0;
6340
6341         if (!perf_tp_filter_match(event, data))
6342                 return 0;
6343
6344         return 1;
6345 }
6346
6347 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6348                    struct pt_regs *regs, struct hlist_head *head, int rctx,
6349                    struct task_struct *task)
6350 {
6351         struct perf_sample_data data;
6352         struct perf_event *event;
6353
6354         struct perf_raw_record raw = {
6355                 .size = entry_size,
6356                 .data = record,
6357         };
6358
6359         perf_sample_data_init(&data, addr, 0);
6360         data.raw = &raw;
6361
6362         hlist_for_each_entry_rcu(event, head, hlist_entry) {
6363                 if (perf_tp_event_match(event, &data, regs))
6364                         perf_swevent_event(event, count, &data, regs);
6365         }
6366
6367         /*
6368          * If we got specified a target task, also iterate its context and
6369          * deliver this event there too.
6370          */
6371         if (task && task != current) {
6372                 struct perf_event_context *ctx;
6373                 struct trace_entry *entry = record;
6374
6375                 rcu_read_lock();
6376                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6377                 if (!ctx)
6378                         goto unlock;
6379
6380                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6381                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6382                                 continue;
6383                         if (event->attr.config != entry->type)
6384                                 continue;
6385                         if (perf_tp_event_match(event, &data, regs))
6386                                 perf_swevent_event(event, count, &data, regs);
6387                 }
6388 unlock:
6389                 rcu_read_unlock();
6390         }
6391
6392         perf_swevent_put_recursion_context(rctx);
6393 }
6394 EXPORT_SYMBOL_GPL(perf_tp_event);
6395
/* Destroy callback for tracepoint events; installed by perf_tp_event_init(). */
static void tp_perf_event_destroy(struct perf_event *event)
{
	perf_trace_destroy(event);
}
6400
6401 static int perf_tp_event_init(struct perf_event *event)
6402 {
6403         int err;
6404
6405         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6406                 return -ENOENT;
6407
6408         /*
6409          * no branch sampling for tracepoint events
6410          */
6411         if (has_branch_stack(event))
6412                 return -EOPNOTSUPP;
6413
6414         err = perf_trace_init(event);
6415         if (err)
6416                 return err;
6417
6418         event->destroy = tp_perf_event_destroy;
6419
6420         return 0;
6421 }
6422
6423 static struct pmu perf_tracepoint = {
6424         .task_ctx_nr    = perf_sw_context,
6425
6426         .event_init     = perf_tp_event_init,
6427         .add            = perf_trace_add,
6428         .del            = perf_trace_del,
6429         .start          = perf_swevent_start,
6430         .stop           = perf_swevent_stop,
6431         .read           = perf_swevent_read,
6432 };
6433
6434 static inline void perf_tp_register(void)
6435 {
6436         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
6437 }
6438
6439 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6440 {
6441         char *filter_str;
6442         int ret;
6443
6444         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6445                 return -EINVAL;
6446
6447         filter_str = strndup_user(arg, PAGE_SIZE);
6448         if (IS_ERR(filter_str))
6449                 return PTR_ERR(filter_str);
6450
6451         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6452
6453         kfree(filter_str);
6454         return ret;
6455 }
6456
/* Release the filter attached to @event via ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
6461
6462 #else
6463
/* Without CONFIG_EVENT_TRACING there is no tracepoint pmu to register. */
static inline void perf_tp_register(void)
{
}
6467
6468 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6469 {
6470         return -ENOENT;
6471 }
6472
/* Without CONFIG_EVENT_TRACING there is never a filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}
6476
6477 #endif /* CONFIG_EVENT_TRACING */
6478
6479 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6480 void perf_bp_event(struct perf_event *bp, void *data)
6481 {
6482         struct perf_sample_data sample;
6483         struct pt_regs *regs = data;
6484
6485         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6486
6487         if (!bp->hw.state && !perf_exclude_event(bp, regs))
6488                 perf_swevent_event(bp, 1, &sample, regs);
6489 }
6490 #endif
6491
6492 /*
6493  * hrtimer based swevent callback
6494  */
6495
6496 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6497 {
6498         enum hrtimer_restart ret = HRTIMER_RESTART;
6499         struct perf_sample_data data;
6500         struct pt_regs *regs;
6501         struct perf_event *event;
6502         u64 period;
6503
6504         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6505
6506         if (event->state != PERF_EVENT_STATE_ACTIVE)
6507                 return HRTIMER_NORESTART;
6508
6509         event->pmu->read(event);
6510
6511         perf_sample_data_init(&data, 0, event->hw.last_period);
6512         regs = get_irq_regs();
6513
6514         if (regs && !perf_exclude_event(event, regs)) {
6515                 if (!(event->attr.exclude_idle && is_idle_task(current)))
6516                         if (__perf_event_overflow(event, 1, &data, regs))
6517                                 ret = HRTIMER_NORESTART;
6518         }
6519
6520         period = max_t(u64, 10000, event->hw.sample_period);
6521         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6522
6523         return ret;
6524 }
6525
6526 static void perf_swevent_start_hrtimer(struct perf_event *event)
6527 {
6528         struct hw_perf_event *hwc = &event->hw;
6529         s64 period;
6530
6531         if (!is_sampling_event(event))
6532                 return;
6533
6534         period = local64_read(&hwc->period_left);
6535         if (period) {
6536                 if (period < 0)
6537                         period = 10000;
6538
6539                 local64_set(&hwc->period_left, 0);
6540         } else {
6541                 period = max_t(u64, 10000, hwc->sample_period);
6542         }
6543         __hrtimer_start_range_ns(&hwc->hrtimer,
6544                                 ns_to_ktime(period), 0,
6545                                 HRTIMER_MODE_REL_PINNED, 0);
6546 }
6547
6548 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6549 {
6550         struct hw_perf_event *hwc = &event->hw;
6551
6552         if (is_sampling_event(event)) {
6553                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6554                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
6555
6556                 hrtimer_cancel(&hwc->hrtimer);
6557         }
6558 }
6559
6560 static void perf_swevent_init_hrtimer(struct perf_event *event)
6561 {
6562         struct hw_perf_event *hwc = &event->hw;
6563
6564         if (!is_sampling_event(event))
6565                 return;
6566
6567         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6568         hwc->hrtimer.function = perf_swevent_hrtimer;
6569
6570         /*
6571          * Since hrtimers have a fixed rate, we can do a static freq->period
6572          * mapping and avoid the whole period adjust feedback stuff.
6573          */
6574         if (event->attr.freq) {
6575                 long freq = event->attr.sample_freq;
6576
6577                 event->attr.sample_period = NSEC_PER_SEC / freq;
6578                 hwc->sample_period = event->attr.sample_period;
6579                 local64_set(&hwc->period_left, hwc->sample_period);
6580                 hwc->last_period = hwc->sample_period;
6581                 event->attr.freq = 0;
6582         }
6583 }
6584
6585 /*
6586  * Software event: cpu wall time clock
6587  */
6588
6589 static void cpu_clock_event_update(struct perf_event *event)
6590 {
6591         s64 prev;
6592         u64 now;
6593
6594         now = local_clock();
6595         prev = local64_xchg(&event->hw.prev_count, now);
6596         local64_add(now - prev, &event->count);
6597 }
6598
6599 static void cpu_clock_event_start(struct perf_event *event, int flags)
6600 {
6601         local64_set(&event->hw.prev_count, local_clock());
6602         perf_swevent_start_hrtimer(event);
6603 }
6604
/* Cancel the hrtimer and account the time elapsed up to now. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
6610
6611 static int cpu_clock_event_add(struct perf_event *event, int flags)
6612 {
6613         if (flags & PERF_EF_START)
6614                 cpu_clock_event_start(event, flags);
6615         perf_event_update_userpage(event);
6616
6617         return 0;
6618 }
6619
/* pmu::del for the cpu clock: removing the event is the same as stopping it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	cpu_clock_event_stop(event, flags);
}
6624
/* pmu::read for the cpu clock: bring the count up to date. */
static void cpu_clock_event_read(struct perf_event *event)
{
	cpu_clock_event_update(event);
}
6629
6630 static int cpu_clock_event_init(struct perf_event *event)
6631 {
6632         if (event->attr.type != PERF_TYPE_SOFTWARE)
6633                 return -ENOENT;
6634
6635         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6636                 return -ENOENT;
6637
6638         /*
6639          * no branch sampling for software events
6640          */
6641         if (has_branch_stack(event))
6642                 return -EOPNOTSUPP;
6643
6644         perf_swevent_init_hrtimer(event);
6645
6646         return 0;
6647 }
6648
6649 static struct pmu perf_cpu_clock = {
6650         .task_ctx_nr    = perf_sw_context,
6651
6652         .event_init     = cpu_clock_event_init,
6653         .add            = cpu_clock_event_add,
6654         .del            = cpu_clock_event_del,
6655         .start          = cpu_clock_event_start,
6656         .stop           = cpu_clock_event_stop,
6657         .read           = cpu_clock_event_read,
6658 };
6659
6660 /*
6661  * Software event: task time clock
6662  */
6663
6664 static void task_clock_event_update(struct perf_event *event, u64 now)
6665 {
6666         u64 prev;
6667         s64 delta;
6668
6669         prev = local64_xchg(&event->hw.prev_count, now);
6670         delta = now - prev;
6671         local64_add(delta, &event->count);
6672 }
6673
6674 static void task_clock_event_start(struct perf_event *event, int flags)
6675 {
6676         local64_set(&event->hw.prev_count, event->ctx->time);
6677         perf_swevent_start_hrtimer(event);
6678 }
6679
6680 static void task_clock_event_stop(struct perf_event *event, int flags)
6681 {
6682         perf_swevent_cancel_hrtimer(event);
6683         task_clock_event_update(event, event->ctx->time);
6684 }
6685
6686 static int task_clock_event_add(struct perf_event *event, int flags)
6687 {
6688         if (flags & PERF_EF_START)
6689                 task_clock_event_start(event, flags);
6690         perf_event_update_userpage(event);
6691
6692         return 0;
6693 }
6694
6695 static void task_clock_event_del(struct perf_event *event, int flags)
6696 {
6697         task_clock_event_stop(event, PERF_EF_UPDATE);
6698 }
6699
6700 static void task_clock_event_read(struct perf_event *event)
6701 {
6702         u64 now = perf_clock();
6703         u64 delta = now - event->ctx->timestamp;
6704         u64 time = event->ctx->time + delta;
6705
6706         task_clock_event_update(event, time);
6707 }
6708
6709 static int task_clock_event_init(struct perf_event *event)
6710 {
6711         if (event->attr.type != PERF_TYPE_SOFTWARE)
6712                 return -ENOENT;
6713
6714         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
6715                 return -ENOENT;
6716
6717         /*
6718          * no branch sampling for software events
6719          */
6720         if (has_branch_stack(event))
6721                 return -EOPNOTSUPP;
6722
6723         perf_swevent_init_hrtimer(event);
6724
6725         return 0;
6726 }
6727
6728 static struct pmu perf_task_clock = {
6729         .task_ctx_nr    = perf_sw_context,
6730
6731         .event_init     = task_clock_event_init,
6732         .add            = task_clock_event_add,
6733         .del            = task_clock_event_del,
6734         .start          = task_clock_event_start,
6735         .stop           = task_clock_event_stop,
6736         .read           = task_clock_event_read,
6737 };
6738
/* Do-nothing stand-in for optional void pmu callbacks. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}
6742
/* Do-nothing stand-in for optional int pmu callbacks; always reports 0. */
static int perf_pmu_nop_int(struct pmu *pmu)
{
	return 0;
}
6747
/* Default start_txn: opening a transaction disables the pmu. */
static void perf_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
}
6752
/* Default commit_txn: re-enables the pmu and always reports success (0). */
static int perf_pmu_commit_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);

	return 0;
}
6758
/* Default cancel_txn: abandoning a transaction re-enables the pmu. */
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);
}
6763
/* Default pmu::event_idx: every event reports index 0. */
static int perf_event_idx_default(struct perf_event *event)
{
	return 0;
}
6768
6769 /*
6770  * Ensures all contexts with the same task_ctx_nr have the same
6771  * pmu_cpu_context too.
6772  */
6773 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
6774 {
6775         struct pmu *pmu;
6776
6777         if (ctxn < 0)
6778                 return NULL;
6779
6780         list_for_each_entry(pmu, &pmus, entry) {
6781                 if (pmu->task_ctx_nr == ctxn)
6782                         return pmu->pmu_cpu_context;
6783         }
6784
6785         return NULL;
6786 }
6787
6788 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
6789 {
6790         int cpu;
6791
6792         for_each_possible_cpu(cpu) {
6793                 struct perf_cpu_context *cpuctx;
6794
6795                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6796
6797                 if (cpuctx->unique_pmu == old_pmu)
6798                         cpuctx->unique_pmu = pmu;
6799         }
6800 }
6801
6802 static void free_pmu_context(struct pmu *pmu)
6803 {
6804         struct pmu *i;
6805
6806         mutex_lock(&pmus_lock);
6807         /*
6808          * Like a real lame refcount.
6809          */
6810         list_for_each_entry(i, &pmus, entry) {
6811                 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
6812                         update_pmu_context(i, pmu);
6813                         goto out;
6814                 }
6815         }
6816
6817         free_percpu(pmu->pmu_cpu_context);
6818 out:
6819         mutex_unlock(&pmus_lock);
6820 }
6821 static struct idr pmu_idr;
6822
6823 static ssize_t
6824 type_show(struct device *dev, struct device_attribute *attr, char *page)
6825 {
6826         struct pmu *pmu = dev_get_drvdata(dev);
6827
6828         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6829 }
6830 static DEVICE_ATTR_RO(type);
6831
6832 static ssize_t
6833 perf_event_mux_interval_ms_show(struct device *dev,
6834                                 struct device_attribute *attr,
6835                                 char *page)
6836 {
6837         struct pmu *pmu = dev_get_drvdata(dev);
6838
6839         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6840 }
6841
6842 static ssize_t
6843 perf_event_mux_interval_ms_store(struct device *dev,
6844                                  struct device_attribute *attr,
6845                                  const char *buf, size_t count)
6846 {
6847         struct pmu *pmu = dev_get_drvdata(dev);
6848         int timer, cpu, ret;
6849
6850         ret = kstrtoint(buf, 0, &timer);
6851         if (ret)
6852                 return ret;
6853
6854         if (timer < 1)
6855                 return -EINVAL;
6856
6857         /* same value, noting to do */
6858         if (timer == pmu->hrtimer_interval_ms)
6859                 return count;
6860
6861         pmu->hrtimer_interval_ms = timer;
6862
6863         /* update all cpuctx for this PMU */
6864         for_each_possible_cpu(cpu) {
6865                 struct perf_cpu_context *cpuctx;
6866                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6867                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6868
6869                 if (hrtimer_active(&cpuctx->hrtimer))
6870                         hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6871         }
6872
6873         return count;
6874 }
6875 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6876
6877 static struct attribute *pmu_dev_attrs[] = {
6878         &dev_attr_type.attr,
6879         &dev_attr_perf_event_mux_interval_ms.attr,
6880         NULL,
6881 };
6882 ATTRIBUTE_GROUPS(pmu_dev);
6883
6884 static int pmu_bus_running;
6885 static struct bus_type pmu_bus = {
6886         .name           = "event_source",
6887         .dev_groups     = pmu_dev_groups,
6888 };
6889
/* Device release callback: frees the struct device from pmu_dev_alloc(). */
static void pmu_dev_release(struct device *dev)
{
	kfree(dev);
}
6894
6895 static int pmu_dev_alloc(struct pmu *pmu)
6896 {
6897         int ret = -ENOMEM;
6898
6899         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
6900         if (!pmu->dev)
6901                 goto out;
6902
6903         pmu->dev->groups = pmu->attr_groups;
6904         device_initialize(pmu->dev);
6905         ret = dev_set_name(pmu->dev, "%s", pmu->name);
6906         if (ret)
6907                 goto free_dev;
6908
6909         dev_set_drvdata(pmu->dev, pmu);
6910         pmu->dev->bus = &pmu_bus;
6911         pmu->dev->release = pmu_dev_release;
6912         ret = device_add(pmu->dev);
6913         if (ret)
6914                 goto free_dev;
6915
6916 out:
6917         return ret;
6918
6919 free_dev:
6920         put_device(pmu->dev);
6921         goto out;
6922 }
6923
6924 static struct lock_class_key cpuctx_mutex;
6925 static struct lock_class_key cpuctx_lock;
6926
6927 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
6928 {
6929         int cpu, ret;
6930
6931         mutex_lock(&pmus_lock);
6932         ret = -ENOMEM;
6933         pmu->pmu_disable_count = alloc_percpu(int);
6934         if (!pmu->pmu_disable_count)
6935                 goto unlock;
6936
6937         pmu->type = -1;
6938         if (!name)
6939                 goto skip_type;
6940         pmu->name = name;
6941
6942         if (type < 0) {
6943                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
6944                 if (type < 0) {
6945                         ret = type;
6946                         goto free_pdc;
6947                 }
6948         }
6949         pmu->type = type;
6950
6951         if (pmu_bus_running) {
6952                 ret = pmu_dev_alloc(pmu);
6953                 if (ret)
6954                         goto free_idr;
6955         }
6956
6957 skip_type:
6958         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6959         if (pmu->pmu_cpu_context)
6960                 goto got_cpu_context;
6961
6962         ret = -ENOMEM;
6963         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6964         if (!pmu->pmu_cpu_context)
6965                 goto free_dev;
6966
6967         for_each_possible_cpu(cpu) {
6968                 struct perf_cpu_context *cpuctx;
6969
6970                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6971                 __perf_event_init_context(&cpuctx->ctx);
6972                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6973                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6974                 cpuctx->ctx.pmu = pmu;
6975
6976                 __perf_cpu_hrtimer_init(cpuctx, cpu);
6977
6978                 cpuctx->unique_pmu = pmu;
6979         }
6980
6981 got_cpu_context:
6982         if (!pmu->start_txn) {
6983                 if (pmu->pmu_enable) {
6984                         /*
6985                          * If we have pmu_enable/pmu_disable calls, install
6986                          * transaction stubs that use that to try and batch
6987                          * hardware accesses.
6988                          */
6989                         pmu->start_txn  = perf_pmu_start_txn;
6990                         pmu->commit_txn = perf_pmu_commit_txn;
6991                         pmu->cancel_txn = perf_pmu_cancel_txn;
6992                 } else {
6993                         pmu->start_txn  = perf_pmu_nop_void;
6994                         pmu->commit_txn = perf_pmu_nop_int;
6995                         pmu->cancel_txn = perf_pmu_nop_void;
6996                 }
6997         }
6998
6999         if (!pmu->pmu_enable) {
7000                 pmu->pmu_enable  = perf_pmu_nop_void;
7001                 pmu->pmu_disable = perf_pmu_nop_void;
7002         }
7003
7004         if (!pmu->event_idx)
7005                 pmu->event_idx = perf_event_idx_default;
7006
7007         list_add_rcu(&pmu->entry, &pmus);
7008         ret = 0;
7009 unlock:
7010         mutex_unlock(&pmus_lock);
7011
7012         return ret;
7013
7014 free_dev:
7015         device_del(pmu->dev);
7016         put_device(pmu->dev);
7017
7018 free_idr:
7019         if (pmu->type >= PERF_TYPE_MAX)
7020                 idr_remove(&pmu_idr, pmu->type);
7021
7022 free_pdc:
7023         free_percpu(pmu->pmu_disable_count);
7024         goto unlock;
7025 }
7026 EXPORT_SYMBOL_GPL(perf_pmu_register);
7027
7028 void perf_pmu_unregister(struct pmu *pmu)
7029 {
7030         mutex_lock(&pmus_lock);
7031         list_del_rcu(&pmu->entry);
7032         mutex_unlock(&pmus_lock);
7033
7034         /*
7035          * We dereference the pmu list under both SRCU and regular RCU, so
7036          * synchronize against both of those.
7037          */
7038         synchronize_srcu(&pmus_srcu);
7039         synchronize_rcu();
7040
7041         free_percpu(pmu->pmu_disable_count);
7042         if (pmu->type >= PERF_TYPE_MAX)
7043                 idr_remove(&pmu_idr, pmu->type);
7044         device_del(pmu->dev);
7045         put_device(pmu->dev);
7046         free_pmu_context(pmu);
7047 }
7048 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7049
7050 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7051 {
7052         int ret;
7053
7054         if (!try_module_get(pmu->module))
7055                 return -ENODEV;
7056         event->pmu = pmu;
7057         ret = pmu->event_init(event);
7058         if (ret)
7059                 module_put(pmu->module);
7060
7061         return ret;
7062 }
7063
7064 struct pmu *perf_init_event(struct perf_event *event)
7065 {
7066         struct pmu *pmu = NULL;
7067         int idx;
7068         int ret;
7069
7070         idx = srcu_read_lock(&pmus_srcu);
7071
7072         rcu_read_lock();
7073         pmu = idr_find(&pmu_idr, event->attr.type);
7074         rcu_read_unlock();
7075         if (pmu) {
7076                 ret = perf_try_init_event(pmu, event);
7077                 if (ret)
7078                         pmu = ERR_PTR(ret);
7079                 goto unlock;
7080         }
7081
7082         list_for_each_entry_rcu(pmu, &pmus, entry) {
7083                 ret = perf_try_init_event(pmu, event);
7084                 if (!ret)
7085                         goto unlock;
7086
7087                 if (ret != -ENOENT) {
7088                         pmu = ERR_PTR(ret);
7089                         goto unlock;
7090                 }
7091         }
7092         pmu = ERR_PTR(-ENOENT);
7093 unlock:
7094         srcu_read_unlock(&pmus_srcu, idx);
7095
7096         return pmu;
7097 }
7098
7099 static void account_event_cpu(struct perf_event *event, int cpu)
7100 {
7101         if (event->parent)
7102                 return;
7103
7104         if (is_cgroup_event(event))
7105                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7106 }
7107
7108 static void account_event(struct perf_event *event)
7109 {
7110         if (event->parent)
7111                 return;
7112
7113         if (event->attach_state & PERF_ATTACH_TASK)
7114                 static_key_slow_inc(&perf_sched_events.key);
7115         if (event->attr.mmap || event->attr.mmap_data)
7116                 atomic_inc(&nr_mmap_events);
7117         if (event->attr.comm)
7118                 atomic_inc(&nr_comm_events);
7119         if (event->attr.task)
7120                 atomic_inc(&nr_task_events);
7121         if (event->attr.freq) {
7122                 if (atomic_inc_return(&nr_freq_events) == 1)
7123                         tick_nohz_full_kick_all();
7124         }
7125         if (has_branch_stack(event))
7126                 static_key_slow_inc(&perf_sched_events.key);
7127         if (is_cgroup_event(event))
7128                 static_key_slow_inc(&perf_sched_events.key);
7129
7130         account_event_cpu(event, event->cpu);
7131 }
7132
7133 /*
7134  * Allocate and initialize a event structure
7135  */
7136 static struct perf_event *
7137 perf_event_alloc(struct perf_event_attr *attr, int cpu,
7138                  struct task_struct *task,
7139                  struct perf_event *group_leader,
7140                  struct perf_event *parent_event,
7141                  perf_overflow_handler_t overflow_handler,
7142                  void *context)
7143 {
7144         struct pmu *pmu;
7145         struct perf_event *event;
7146         struct hw_perf_event *hwc;
7147         long err = -EINVAL;
7148
7149         if ((unsigned)cpu >= nr_cpu_ids) {
7150                 if (!task || cpu != -1)
7151                         return ERR_PTR(-EINVAL);
7152         }
7153
7154         event = kzalloc(sizeof(*event), GFP_KERNEL);
7155         if (!event)
7156                 return ERR_PTR(-ENOMEM);
7157
7158         /*
7159          * Single events are their own group leaders, with an
7160          * empty sibling list:
7161          */
7162         if (!group_leader)
7163                 group_leader = event;
7164
7165         mutex_init(&event->child_mutex);
7166         INIT_LIST_HEAD(&event->child_list);
7167
7168         INIT_LIST_HEAD(&event->group_entry);
7169         INIT_LIST_HEAD(&event->event_entry);
7170         INIT_LIST_HEAD(&event->sibling_list);
7171         INIT_LIST_HEAD(&event->rb_entry);
7172         INIT_LIST_HEAD(&event->active_entry);
7173         INIT_HLIST_NODE(&event->hlist_entry);
7174
7175
7176         init_waitqueue_head(&event->waitq);
7177         init_irq_work(&event->pending, perf_pending_event);
7178
7179         mutex_init(&event->mmap_mutex);
7180
7181         atomic_long_set(&event->refcount, 1);
7182         event->cpu              = cpu;
7183         event->attr             = *attr;
7184         event->group_leader     = group_leader;
7185         event->pmu              = NULL;
7186         event->oncpu            = -1;
7187
7188         event->parent           = parent_event;
7189
7190         event->ns               = get_pid_ns(task_active_pid_ns(current));
7191         event->id               = atomic64_inc_return(&perf_event_id);
7192
7193         event->state            = PERF_EVENT_STATE_INACTIVE;
7194
7195         if (task) {
7196                 event->attach_state = PERF_ATTACH_TASK;
7197
7198                 if (attr->type == PERF_TYPE_TRACEPOINT)
7199                         event->hw.tp_target = task;
7200 #ifdef CONFIG_HAVE_HW_BREAKPOINT
7201                 /*
7202                  * hw_breakpoint is a bit difficult here..
7203                  */
7204                 else if (attr->type == PERF_TYPE_BREAKPOINT)
7205                         event->hw.bp_target = task;
7206 #endif
7207         }
7208
7209         if (!overflow_handler && parent_event) {
7210                 overflow_handler = parent_event->overflow_handler;
7211                 context = parent_event->overflow_handler_context;
7212         }
7213
7214         event->overflow_handler = overflow_handler;
7215         event->overflow_handler_context = context;
7216
7217         perf_event__state_init(event);
7218
7219         pmu = NULL;
7220
7221         hwc = &event->hw;
7222         hwc->sample_period = attr->sample_period;
7223         if (attr->freq && attr->sample_freq)
7224                 hwc->sample_period = 1;
7225         hwc->last_period = hwc->sample_period;
7226
7227         local64_set(&hwc->period_left, hwc->sample_period);
7228
7229         /*
7230          * we currently do not support PERF_FORMAT_GROUP on inherited events
7231          */
7232         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7233                 goto err_ns;
7234
7235         if (!has_branch_stack(event))
7236                 event->attr.branch_sample_type = 0;
7237
7238         pmu = perf_init_event(event);
7239         if (!pmu)
7240                 goto err_ns;
7241         else if (IS_ERR(pmu)) {
7242                 err = PTR_ERR(pmu);
7243                 goto err_ns;
7244         }
7245
7246         if (!event->parent) {
7247                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7248                         err = get_callchain_buffers();
7249                         if (err)
7250                                 goto err_pmu;
7251                 }
7252         }
7253
7254         return event;
7255
7256 err_pmu:
7257         if (event->destroy)
7258                 event->destroy(event);
7259         module_put(pmu->module);
7260 err_ns:
7261         if (event->ns)
7262                 put_pid_ns(event->ns);
7263         kfree(event);
7264
7265         return ERR_PTR(err);
7266 }
7267
7268 static int perf_copy_attr(struct perf_event_attr __user *uattr,
7269                           struct perf_event_attr *attr)
7270 {
7271         u32 size;
7272         int ret;
7273
7274         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7275                 return -EFAULT;
7276
7277         /*
7278          * zero the full structure, so that a short copy will be nice.
7279          */
7280         memset(attr, 0, sizeof(*attr));
7281
7282         ret = get_user(size, &uattr->size);
7283         if (ret)
7284                 return ret;
7285
7286         if (size > PAGE_SIZE)   /* silly large */
7287                 goto err_size;
7288
7289         if (!size)              /* abi compat */
7290                 size = PERF_ATTR_SIZE_VER0;
7291
7292         if (size < PERF_ATTR_SIZE_VER0)
7293                 goto err_size;
7294
7295         /*
7296          * If we're handed a bigger struct than we know of,
7297          * ensure all the unknown bits are 0 - i.e. new
7298          * user-space does not rely on any kernel feature
7299          * extensions we dont know about yet.
7300          */
7301         if (size > sizeof(*attr)) {
7302                 unsigned char __user *addr;
7303                 unsigned char __user *end;
7304                 unsigned char val;
7305
7306                 addr = (void __user *)uattr + sizeof(*attr);
7307                 end  = (void __user *)uattr + size;
7308
7309                 for (; addr < end; addr++) {
7310                         ret = get_user(val, addr);
7311                         if (ret)
7312                                 return ret;
7313                         if (val)
7314                                 goto err_size;
7315                 }
7316                 size = sizeof(*attr);
7317         }
7318
7319         ret = copy_from_user(attr, uattr, size);
7320         if (ret)
7321                 return -EFAULT;
7322
7323         if (attr->__reserved_1)
7324                 return -EINVAL;
7325
7326         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7327                 return -EINVAL;
7328
7329         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7330                 return -EINVAL;
7331
7332         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7333                 u64 mask = attr->branch_sample_type;
7334
7335                 /* only using defined bits */
7336                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7337                         return -EINVAL;
7338
7339                 /* at least one branch bit must be set */
7340                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7341                         return -EINVAL;
7342
7343                 /* propagate priv level, when not set for branch */
7344                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7345
7346                         /* exclude_kernel checked on syscall entry */
7347                         if (!attr->exclude_kernel)
7348                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7349
7350                         if (!attr->exclude_user)
7351                                 mask |= PERF_SAMPLE_BRANCH_USER;
7352
7353                         if (!attr->exclude_hv)
7354                                 mask |= PERF_SAMPLE_BRANCH_HV;
7355                         /*
7356                          * adjust user setting (for HW filter setup)
7357                          */
7358                         attr->branch_sample_type = mask;
7359                 }
7360                 /* privileged levels capture (kernel, hv): check permissions */
7361                 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
7362                     && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7363                         return -EACCES;
7364         }
7365
7366         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
7367                 ret = perf_reg_validate(attr->sample_regs_user);
7368                 if (ret)
7369                         return ret;
7370         }
7371
7372         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7373                 if (!arch_perf_have_user_stack_dump())
7374                         return -ENOSYS;
7375
7376                 /*
7377                  * We have __u32 type for the size, but so far
7378                  * we can only use __u16 as maximum due to the
7379                  * __u16 sample size limit.
7380                  */
7381                 if (attr->sample_stack_user >= USHRT_MAX)
7382                         ret = -EINVAL;
7383                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7384                         ret = -EINVAL;
7385         }
7386
7387         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7388                 ret = perf_reg_validate(attr->sample_regs_intr);
7389 out:
7390         return ret;
7391
7392 err_size:
7393         put_user(sizeof(*attr), &uattr->size);
7394         ret = -E2BIG;
7395         goto out;
7396 }
7397
7398 static int
7399 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7400 {
7401         struct ring_buffer *rb = NULL;
7402         int ret = -EINVAL;
7403
7404         if (!output_event)
7405                 goto set;
7406
7407         /* don't allow circular references */
7408         if (event == output_event)
7409                 goto out;
7410
7411         /*
7412          * Don't allow cross-cpu buffers
7413          */
7414         if (output_event->cpu != event->cpu)
7415                 goto out;
7416
7417         /*
7418          * If its not a per-cpu rb, it must be the same task.
7419          */
7420         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7421                 goto out;
7422
7423 set:
7424         mutex_lock(&event->mmap_mutex);
7425         /* Can't redirect output if we've got an active mmap() */
7426         if (atomic_read(&event->mmap_count))
7427                 goto unlock;
7428
7429         if (output_event) {
7430                 /* get the rb we want to redirect to */
7431                 rb = ring_buffer_get(output_event);
7432                 if (!rb)
7433                         goto unlock;
7434         }
7435
7436         ring_buffer_attach(event, rb);
7437
7438         ret = 0;
7439 unlock:
7440         mutex_unlock(&event->mmap_mutex);
7441
7442 out:
7443         return ret;
7444 }
7445
7446 static void mutex_lock_double(struct mutex *a, struct mutex *b)
7447 {
7448         if (b < a)
7449                 swap(a, b);
7450
7451         mutex_lock(a);
7452         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7453 }
7454
7455 /**
7456  * sys_perf_event_open - open a performance event, associate it to a task/cpu
7457  *
7458  * @attr_uptr:  event_id type attributes for monitoring/sampling
7459  * @pid:                target pid
7460  * @cpu:                target cpu
7461  * @group_fd:           group leader event fd
      * @flags:              PERF_FLAG_* bits; any bit outside PERF_FLAG_ALL is
      *                      rejected with -EINVAL
      *
      * Returns the new event file descriptor on success, or an error code
      * (e.g. -EINVAL, -EACCES, or whatever a failing helper reported).
7462  */
7463 SYSCALL_DEFINE5(perf_event_open,
7464                 struct perf_event_attr __user *, attr_uptr,
7465                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
7466 {
7467         struct perf_event *group_leader = NULL, *output_event = NULL;
7468         struct perf_event *event, *sibling;
7469         struct perf_event_attr attr;
7470         struct perf_event_context *ctx, *uninitialized_var(gctx);
7471         struct file *event_file = NULL;
7472         struct fd group = {NULL, 0};
7473         struct task_struct *task = NULL;
7474         struct pmu *pmu;
7475         int event_fd;
7476         int move_group = 0;
7477         int err;
7478         int f_flags = O_RDWR;
7479
7480         /* for future expandability... */
7481         if (flags & ~PERF_FLAG_ALL)
7482                 return -EINVAL;
7483
             /* Pull in and validate the user-supplied attributes. */
7484         err = perf_copy_attr(attr_uptr, &attr);
7485         if (err)
7486                 return err;
7487
             /*
              * An event that does not exclude the kernel needs CAP_SYS_ADMIN
              * whenever perf_paranoid_kernel() is set.
              */
7488         if (!attr.exclude_kernel) {
7489                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7490                         return -EACCES;
7491         }
7492
             /*
              * Frequency mode is capped by sysctl_perf_event_sample_rate;
              * period mode rejects periods with bit 63 set.
              */
7493         if (attr.freq) {
7494                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7495                         return -EINVAL;
7496         } else {
7497                 if (attr.sample_period & (1ULL << 63))
7498                         return -EINVAL;
7499         }
7500
7501         /*
7502          * In cgroup mode, the pid argument is used to pass the fd
7503          * opened to the cgroup directory in cgroupfs. The cpu argument
7504          * designates the cpu on which to monitor threads from that
7505          * cgroup.
7506          */
7507         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7508                 return -EINVAL;
7509
7510         if (flags & PERF_FLAG_FD_CLOEXEC)
7511                 f_flags |= O_CLOEXEC;
7512
             /*
              * Reserve the descriptor number now.  It is only published by
              * fd_install() at the very end and is handed back with
              * put_unused_fd() on every failure path.
              */
7513         event_fd = get_unused_fd_flags(f_flags);
7514         if (event_fd < 0)
7515                 return event_fd;
7516
             /*
              * group_fd names an existing event.  By default it becomes the
              * group leader; PERF_FLAG_FD_OUTPUT additionally makes it the
              * output target and PERF_FLAG_FD_NO_GROUP drops the leader role.
              */
7517         if (group_fd != -1) {
7518                 err = perf_fget_light(group_fd, &group);
7519                 if (err)
7520                         goto err_fd;
7521                 group_leader = group.file->private_data;
7522                 if (flags & PERF_FLAG_FD_OUTPUT)
7523                         output_event = group_leader;
7524                 if (flags & PERF_FLAG_FD_NO_GROUP)
7525                         group_leader = NULL;
7526         }
7527
             /* Outside cgroup mode, a pid other than -1 selects the target task. */
7528         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
7529                 task = find_lively_task_by_vpid(pid);
7530                 if (IS_ERR(task)) {
7531                         err = PTR_ERR(task);
7532                         goto err_group_fd;
7533                 }
7534         }
7535
             /* A task-bound event must agree with its group leader on attr.inherit. */
7536         if (task && group_leader &&
7537             group_leader->attr.inherit != attr.inherit) {
7538                 err = -EINVAL;
7539                 goto err_task;
7540         }
7541
             /* Paired with put_online_cpus() on the success path and at err_cpus. */
7542         get_online_cpus();
7543
7544         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7545                                  NULL, NULL);
7546         if (IS_ERR(event)) {
7547                 err = PTR_ERR(event);
7548                 goto err_cpus;
7549         }
7550
7551         if (flags & PERF_FLAG_PID_CGROUP) {
7552                 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7553                 if (err) {
                             /*
                              * NOTE(review): this path uses __free_event() while
                              * err_alloc uses free_event() - presumably because
                              * account_event() has not run yet.  The sampling
                              * check below also precedes account_event() but
                              * goes through err_alloc; confirm this is intended.
                              */
7554                         __free_event(event);
7555                         goto err_cpus;
7556                 }
7557         }
7558
             /*
              * Sampling events are refused on PMUs that advertise
              * PERF_PMU_CAP_NO_INTERRUPT.
              */
7559         if (is_sampling_event(event)) {
7560                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7561                         err = -ENOTSUPP;
7562                         goto err_alloc;
7563                 }
7564         }
7565
7566         account_event(event);
7567
7568         /*
7569          * Special case software events and allow them to be part of
7570          * any hardware group.
7571          */
7572         pmu = event->pmu;
7573
7574         if (group_leader &&
7575             (is_software_event(event) != is_software_event(group_leader))) {
7576                 if (is_software_event(event)) {
7577                         /*
7578                          * If event and group_leader are not both a software
7579                          * event, and event is, then group leader is not.
7580                          *
7581                          * Allow the addition of software events to !software
7582                          * groups, this is safe because software events never
7583                          * fail to schedule.
7584                          */
7585                         pmu = group_leader->pmu;
7586                 } else if (is_software_event(group_leader) &&
7587                            (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7588                         /*
7589                          * In case the group is a pure software group, and we
7590                          * try to add a hardware event, move the whole group to
7591                          * the hardware context.
7592                          */
7593                         move_group = 1;
7594                 }
7595         }
7596
7597         /*
7598          * Get the target context (task or percpu):
7599          */
7600         ctx = find_get_context(pmu, task, event);
7601         if (IS_ERR(ctx)) {
7602                 err = PTR_ERR(ctx);
7603                 goto err_alloc;
7604         }
7605
             /*
              * The task reference is no longer needed once the context has
              * been looked up.  Clearing the pointer keeps err_task from
              * dropping the reference a second time.
              */
7606         if (task) {
7607                 put_task_struct(task);
7608                 task = NULL;
7609         }
7610
7611         /*
7612          * Look up the group leader (we will attach this event to it):
7613          */
7614         if (group_leader) {
7615                 err = -EINVAL;
7616
7617                 /*
7618                  * Do not allow a recursive hierarchy (this new sibling
7619                  * becoming part of another group-sibling):
7620                  */
7621                 if (group_leader->group_leader != group_leader)
7622                         goto err_context;
7623                 /*
7624                  * Do not allow to attach to a group in a different
7625                  * task or CPU context:
7626                  */
7627                 if (move_group) {
7628                         /*
7629                          * Make sure we're both on the same task, or both
7630                          * per-cpu events.
7631                          */
7632                         if (group_leader->ctx->task != ctx->task)
7633                                 goto err_context;
7634
7635                         /*
7636                          * Make sure we're both events for the same CPU;
7637                          * grouping events for different CPUs is broken; since
7638                          * you can never concurrently schedule them anyhow.
7639                          */
7640                         if (group_leader->cpu != event->cpu)
7641                                 goto err_context;
7642                 } else {
7643                         if (group_leader->ctx != ctx)
7644                                 goto err_context;
7645                 }
7646
7647                 /*
7648                  * Only a group leader can be exclusive or pinned
7649                  */
7650                 if (attr.exclusive || attr.pinned)
7651                         goto err_context;
7652         }
7653
             /* Redirect this event's output into output_event's ring buffer. */
7654         if (output_event) {
7655                 err = perf_event_set_output(event, output_event);
7656                 if (err)
7657                         goto err_context;
7658         }
7659
7660         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7661                                         f_flags);
7662         if (IS_ERR(event_file)) {
7663                 err = PTR_ERR(event_file);
7664                 goto err_context;
7665         }
7666
             /*
              * No failure paths past this point: everything below runs
              * unconditionally through to fd_install().
              */
7667         if (move_group) {
7668                 gctx = group_leader->ctx;
7669
7670                 /*
7671                  * See perf_event_ctx_lock() for comments on the details
7672                  * of swizzling perf_event::ctx.
7673                  */
7674                 mutex_lock_double(&gctx->mutex, &ctx->mutex);
7675
7676                 perf_remove_from_context(group_leader, false);
7677
                     /*
                      * One put_ctx(gctx) per sibling taken off the old context;
                      * a further put_ctx(gctx) follows below once gctx->mutex
                      * has been released.
                      */
7678                 list_for_each_entry(sibling, &group_leader->sibling_list,
7679                                     group_entry) {
7680                         perf_remove_from_context(sibling, false);
7681                         put_ctx(gctx);
7682                 }
7683         } else {
7684                 mutex_lock(&ctx->mutex);
7685         }
7686
7687         WARN_ON_ONCE(ctx->parent_ctx);
7688
7689         if (move_group) {
7690                 /*
7691                  * Wait for everybody to stop referencing the events through
7692                  * the old lists, before installing it on new lists.
7693                  */
7694                 synchronize_rcu();
7695
7696                 /*
7697                  * Install the group siblings before the group leader.
7698                  *
7699                  * Because a group leader will try and install the entire group
7700                  * (through the sibling list, which is still in-tact), we can
7701                  * end up with siblings installed in the wrong context.
7702                  *
7703                  * By installing siblings first we NO-OP because they're not
7704                  * reachable through the group lists.
7705                  */
7706                 list_for_each_entry(sibling, &group_leader->sibling_list,
7707                                     group_entry) {
7708                         perf_event__state_init(sibling);
7709                         perf_install_in_context(ctx, sibling, sibling->cpu);
7710                         get_ctx(ctx);
7711                 }
7712
7713                 /*
7714                  * Removing from the context ends up with disabled
7715                  * event. What we want here is event in the initial
7716                  * startup state, ready to be add into new context.
7717                  */
7718                 perf_event__state_init(group_leader);
7719                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7720                 get_ctx(ctx);
7721         }
7722
7723         perf_install_in_context(ctx, event, event->cpu);
7724         perf_unpin_context(ctx);
7725
             /* Release both context mutexes taken above. */
7726         if (move_group) {
7727                 mutex_unlock(&gctx->mutex);
7728                 put_ctx(gctx);
7729         }
7730         mutex_unlock(&ctx->mutex);
7731
7732         put_online_cpus();
7733
7734         event->owner = current;
7735
             /* Track the event on the opening task's list of owned events. */
7736         mutex_lock(&current->perf_event_mutex);
7737         list_add_tail(&event->owner_entry, &current->perf_event_list);
7738         mutex_unlock(&current->perf_event_mutex);
7739
7740         /*
7741          * Precalculate sample_data sizes
7742          */
7743         perf_event__header_size(event);
7744         perf_event__id_header_size(event);
7745
7746         /*
7747          * Drop the reference on the group_event after placing the
7748          * new event on the sibling_list. This ensures destruction
7749          * of the group leader will find the pointer to itself in
7750          * perf_group_detach().
7751          */
7752         fdput(group);
7753         fd_install(event_fd, event_file);
7754         return event_fd;
7755
             /*
              * Error unwinding: each label undoes the steps that had succeeded
              * before the jump, in reverse order of acquisition.
              */
7756 err_context:
7757         perf_unpin_context(ctx);
7758         put_ctx(ctx);
7759 err_alloc:
7760         free_event(event);
7761 err_cpus:
7762         put_online_cpus();
7763 err_task:
7764         if (task)
7765                 put_task_struct(task);
7766 err_group_fd:
7767         fdput(group);
7768 err_fd:
7769         put_unused_fd(event_fd);
7770         return err;
7771 }
7772
7773 /**
7774  * perf_event_create_kernel_counter
7775  *
7776  * @attr: attributes of the counter to create
7777  * @cpu: cpu in which the counter is bound
7778  * @task: task to profile (NULL for percpu)
7779  */
7780 struct perf_event *
7781 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7782                                  struct task_struct *task,
7783                                  perf_overflow_handler_t overflow_handler,
7784                                  void *context)
7785 {
7786         struct perf_event_context *ctx;
7787         struct perf_event *event;
7788         int err;
7789
7790         /*
7791          * Get the target context (task or percpu):
7792          */
7793
7794         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7795                                  overflow_handler, context);
7796         if (IS_ERR(event)) {
7797                 err = PTR_ERR(event);
7798                 goto err;
7799         }
7800
7801         /* Mark owner so we could distinguish it from user events. */
7802         event->owner = EVENT_OWNER_KERNEL;
7803
7804         account_event(event);
7805
7806         ctx = find_get_context(event->pmu, task, event);
7807         if (IS_ERR(ctx)) {
7808                 err = PTR_ERR(ctx);
7809                 goto err_free;
7810         }
7811
7812         WARN_ON_ONCE(ctx->parent_ctx);
7813         mutex_lock(&ctx->mutex);
7814         perf_install_in_context(ctx, event, cpu);
7815         perf_unpin_context(ctx);
7816         mutex_unlock(&ctx->mutex);
7817
7818         return event;
7819
7820 err_free:
7821         free_event(event);
7822 err:
7823         return ERR_PTR(err);
7824 }
7825 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
7826
7827 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7828 {
7829         struct perf_event_context *src_ctx;
7830         struct perf_event_context *dst_ctx;
7831         struct perf_event *event, *tmp;
7832         LIST_HEAD(events);
7833
7834         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7835         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7836
7837         /*
7838          * See perf_event_ctx_lock() for comments on the details
7839          * of swizzling perf_event::ctx.
7840          */
7841         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
7842         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7843                                  event_entry) {
7844                 perf_remove_from_context(event, false);
7845                 unaccount_event_cpu(event, src_cpu);
7846                 put_ctx(src_ctx);
7847                 list_add(&event->migrate_entry, &events);
7848         }
7849
7850         /*
7851          * Wait for the events to quiesce before re-instating them.
7852          */
7853         synchronize_rcu();
7854
7855         /*
7856          * Re-instate events in 2 passes.
7857          *
7858          * Skip over group leaders and only install siblings on this first
7859          * pass, siblings will not get enabled without a leader, however a
7860          * leader will enable its siblings, even if those are still on the old
7861          * context.
7862          */
7863         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7864                 if (event->group_leader == event)
7865                         continue;
7866
7867                 list_del(&event->migrate_entry);
7868                 if (event->state >= PERF_EVENT_STATE_OFF)
7869                         event->state = PERF_EVENT_STATE_INACTIVE;
7870                 account_event_cpu(event, dst_cpu);
7871                 perf_install_in_context(dst_ctx, event, dst_cpu);
7872                 get_ctx(dst_ctx);
7873         }
7874
7875         /*
7876          * Once all the siblings are setup properly, install the group leaders
7877          * to make it go.
7878          */
7879         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7880                 list_del(&event->migrate_entry);
7881                 if (event->state >= PERF_EVENT_STATE_OFF)
7882                         event->state = PERF_EVENT_STATE_INACTIVE;
7883                 account_event_cpu(event, dst_cpu);
7884                 perf_install_in_context(dst_ctx, event, dst_cpu);
7885                 get_ctx(dst_ctx);
7886         }
7887         mutex_unlock(&dst_ctx->mutex);
7888         mutex_unlock(&src_ctx->mutex);
7889 }
7890 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
7891
7892 static void sync_child_event(struct perf_event *child_event,
7893                                struct task_struct *child)
7894 {
7895         struct perf_event *parent_event = child_event->parent;
7896         u64 child_val;
7897
7898         if (child_event->attr.inherit_stat)
7899                 perf_event_read_event(child_event, child);
7900
7901         child_val = perf_event_count(child_event);
7902
7903         /*
7904          * Add back the child's count to the parent's count:
7905          */
7906         atomic64_add(child_val, &parent_event->child_count);
7907         atomic64_add(child_event->total_time_enabled,
7908                      &parent_event->child_total_time_enabled);
7909         atomic64_add(child_event->total_time_running,
7910                      &parent_event->child_total_time_running);
7911
7912         /*
7913          * Remove this event from the parent's list
7914          */
7915         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7916         mutex_lock(&parent_event->child_mutex);
7917         list_del_init(&child_event->child_list);
7918         mutex_unlock(&parent_event->child_mutex);
7919
7920         /*
7921          * Make sure user/parent get notified, that we just
7922          * lost one event.
7923          */
7924         perf_event_wakeup(parent_event);
7925
7926         /*
7927          * Release the parent event, if this was the last
7928          * reference to it.
7929          */
7930         put_event(parent_event);
7931 }
7932
7933 static void
7934 __perf_event_exit_task(struct perf_event *child_event,
7935                          struct perf_event_context *child_ctx,
7936                          struct task_struct *child)
7937 {
7938         /*
7939          * Do not destroy the 'original' grouping; because of the context
7940          * switch optimization the original events could've ended up in a
7941          * random child task.
7942          *
7943          * If we were to destroy the original group, all group related
7944          * operations would cease to function properly after this random
7945          * child dies.
7946          *
7947          * Do destroy all inherited groups, we don't care about those
7948          * and being thorough is better.
7949          */
7950         perf_remove_from_context(child_event, !!child_event->parent);
7951
7952         /*
7953          * It can happen that the parent exits first, and has events
7954          * that are still around due to the child reference. These
7955          * events need to be zapped.
7956          */
7957         if (child_event->parent) {
7958                 sync_child_event(child_event, child);
7959                 free_event(child_event);
7960         } else {
7961                 child_event->state = PERF_EVENT_STATE_EXIT;
7962                 perf_event_wakeup(child_event);
7963         }
7964 }
7965
     /*
      * Tear down context number @ctxn of the exiting task @child: schedule
      * the context out, detach it from the task, report the task exit and
      * retire every event still on the context's event_list.
      */
7966 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7967 {
7968         struct perf_event *child_event, *next;
7969         struct perf_event_context *child_ctx, *clone_ctx = NULL;
7970         unsigned long flags;
7971
             /* No context of this type: only emit the task event and leave. */
7972         if (likely(!child->perf_event_ctxp[ctxn])) {
7973                 perf_event_task(child, NULL, 0);
7974                 return;
7975         }
7976
             /* Interrupts stay off until raw_spin_unlock_irqrestore() below. */
7977         local_irq_save(flags);
7978         /*
7979          * We can't reschedule here because interrupts are disabled,
7980          * and either child is current or it is a task that can't be
7981          * scheduled, so we are now safe from rescheduling changing
7982          * our context.
7983          */
7984         child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
7985
7986         /*
7987          * Take the context lock here so that if find_get_context is
7988          * reading child->perf_event_ctxp, we wait until it has
7989          * incremented the context's refcount before we do put_ctx below.
7990          */
7991         raw_spin_lock(&child_ctx->lock);
7992         task_ctx_sched_out(child_ctx);
             /* From here on the task no longer points at this context. */
7993         child->perf_event_ctxp[ctxn] = NULL;
7994
7995         /*
7996          * If this context is a clone; unclone it so it can't get
7997          * swapped to another process while we're removing all
7998          * the events from it.
7999          */
8000         clone_ctx = unclone_ctx(child_ctx);
8001         update_context_time(child_ctx);
8002         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8003
             /*
              * Done outside the spinlock.  Presumably this drops the reference
              * the clone held on the context returned by unclone_ctx() -
              * confirm against unclone_ctx().
              */
8004         if (clone_ctx)
8005                 put_ctx(clone_ctx);
8006
8007         /*
8008          * Report the task dead after unscheduling the events so that we
8009          * won't get any samples after PERF_RECORD_EXIT. We can however still
8010          * get a few PERF_RECORD_READ events.
8011          */
8012         perf_event_task(child, child_ctx, 0);
8013
8014         /*
8015          * We can recurse on the same lock type through:
8016          *
8017          *   __perf_event_exit_task()
8018          *     sync_child_event()
8019          *       put_event()
8020          *         mutex_lock(&ctx->mutex)
8021          *
8022          * But since its the parent context it won't be the same instance.
8023          */
8024         mutex_lock(&child_ctx->mutex);
8025
             /* The _safe iterator is needed: inherited events are freed in the loop. */
8026         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8027                 __perf_event_exit_task(child_event, child_ctx, child);
8028
8029         mutex_unlock(&child_ctx->mutex);
8030
             /*
              * Presumably the reference that child->perf_event_ctxp[ctxn] held
              * before it was cleared above - confirm.
              */
8031         put_ctx(child_ctx);
8032 }
8033
8034 /*
8035  * When a child task exits, feed back event values to parent events.
8036  */
8037 void perf_event_exit_task(struct task_struct *child)
8038 {
8039         struct perf_event *event, *tmp;
8040         int ctxn;
8041
8042         mutex_lock(&child->perf_event_mutex);
8043         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8044                                  owner_entry) {
8045                 list_del_init(&event->owner_entry);
8046
8047                 /*
8048                  * Ensure the list deletion is visible before we clear
8049                  * the owner, closes a race against perf_release() where
8050                  * we need to serialize on the owner->perf_event_mutex.
8051                  */
8052                 smp_wmb();
8053                 event->owner = NULL;
8054         }
8055         mutex_unlock(&child->perf_event_mutex);
8056
8057         for_each_task_context_nr(ctxn)
8058                 perf_event_exit_task_context(child, ctxn);
8059 }
8060
8061 static void perf_free_event(struct perf_event *event,
8062                             struct perf_event_context *ctx)
8063 {
8064         struct perf_event *parent = event->parent;
8065
8066         if (WARN_ON_ONCE(!parent))
8067                 return;
8068
8069         mutex_lock(&parent->child_mutex);
8070         list_del_init(&event->child_list);
8071         mutex_unlock(&parent->child_mutex);
8072
8073         put_event(parent);
8074
8075         raw_spin_lock_irq(&ctx->lock);
8076         perf_group_detach(event);
8077         list_del_event(event, ctx);
8078         raw_spin_unlock_irq(&ctx->lock);
8079         free_event(event);
8080 }
8081
8082 /*
8083  * Free an unexposed, unused context as created by inheritance by
8084  * perf_event_init_task below, used by fork() in case of fail.
8085  *
8086  * Not all locks are strictly required, but take them anyway to be nice and
8087  * help out with the lockdep assertions.
8088  */
8089 void perf_event_free_task(struct task_struct *task)
8090 {
8091         struct perf_event_context *ctx;
8092         struct perf_event *event, *tmp;
8093         int ctxn;
8094
8095         for_each_task_context_nr(ctxn) {
8096                 ctx = task->perf_event_ctxp[ctxn];
8097                 if (!ctx)
8098                         continue;
8099
8100                 mutex_lock(&ctx->mutex);
8101 again:
8102                 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8103                                 group_entry)
8104                         perf_free_event(event, ctx);
8105
8106                 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8107                                 group_entry)
8108                         perf_free_event(event, ctx);
8109
8110                 if (!list_empty(&ctx->pinned_groups) ||
8111                                 !list_empty(&ctx->flexible_groups))
8112                         goto again;
8113
8114                 mutex_unlock(&ctx->mutex);
8115
8116                 put_ctx(ctx);
8117         }
8118 }
8119
8120 void perf_event_delayed_put(struct task_struct *task)
8121 {
8122         int ctxn;
8123
8124         for_each_task_context_nr(ctxn)
8125                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8126 }
8127
8128 /*
8129  * inherit an event from parent task to child task:
8130  */
8131 static struct perf_event *
8132 inherit_event(struct perf_event *parent_event,
8133               struct task_struct *parent,
8134               struct perf_event_context *parent_ctx,
8135               struct task_struct *child,
8136               struct perf_event *group_leader,
8137               struct perf_event_context *child_ctx)
8138 {
8139         enum perf_event_active_state parent_state = parent_event->state;
8140         struct perf_event *child_event;
8141         unsigned long flags;
8142
8143         /*
8144          * Instead of creating recursive hierarchies of events,
8145          * we link inherited events back to the original parent,
8146          * which has a filp for sure, which we use as the reference
8147          * count:
8148          */
8149         if (parent_event->parent)
8150                 parent_event = parent_event->parent;
8151
8152         child_event = perf_event_alloc(&parent_event->attr,
8153                                            parent_event->cpu,
8154                                            child,
8155                                            group_leader, parent_event,
8156                                            NULL, NULL);
8157         if (IS_ERR(child_event))
8158                 return child_event;
8159
8160         if (is_orphaned_event(parent_event) ||
8161             !atomic_long_inc_not_zero(&parent_event->refcount)) {
8162                 free_event(child_event);
8163                 return NULL;
8164         }
8165
8166         get_ctx(child_ctx);
8167
8168         /*
8169          * Make the child state follow the state of the parent event,
8170          * not its attr.disabled bit.  We hold the parent's mutex,
8171          * so we won't race with perf_event_{en, dis}able_family.
8172          */
8173         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
8174                 child_event->state = PERF_EVENT_STATE_INACTIVE;
8175         else
8176                 child_event->state = PERF_EVENT_STATE_OFF;
8177
8178         if (parent_event->attr.freq) {
8179                 u64 sample_period = parent_event->hw.sample_period;
8180                 struct hw_perf_event *hwc = &child_event->hw;
8181
8182                 hwc->sample_period = sample_period;
8183                 hwc->last_period   = sample_period;
8184
8185                 local64_set(&hwc->period_left, sample_period);
8186         }
8187
8188         child_event->ctx = child_ctx;
8189         child_event->overflow_handler = parent_event->overflow_handler;
8190         child_event->overflow_handler_context
8191                 = parent_event->overflow_handler_context;
8192
8193         /*
8194          * Precalculate sample_data sizes
8195          */
8196         perf_event__header_size(child_event);
8197         perf_event__id_header_size(child_event);
8198
8199         /*
8200          * Link it up in the child's context:
8201          */
8202         raw_spin_lock_irqsave(&child_ctx->lock, flags);
8203         add_event_to_ctx(child_event, child_ctx);
8204         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8205
8206         /*
8207          * Link this into the parent event's child list
8208          */
8209         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8210         mutex_lock(&parent_event->child_mutex);
8211         list_add_tail(&child_event->child_list, &parent_event->child_list);
8212         mutex_unlock(&parent_event->child_mutex);
8213
8214         return child_event;
8215 }
8216
8217 static int inherit_group(struct perf_event *parent_event,
8218               struct task_struct *parent,
8219               struct perf_event_context *parent_ctx,
8220               struct task_struct *child,
8221               struct perf_event_context *child_ctx)
8222 {
8223         struct perf_event *leader;
8224         struct perf_event *sub;
8225         struct perf_event *child_ctr;
8226
8227         leader = inherit_event(parent_event, parent, parent_ctx,
8228                                  child, NULL, child_ctx);
8229         if (IS_ERR(leader))
8230                 return PTR_ERR(leader);
8231         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8232                 child_ctr = inherit_event(sub, parent, parent_ctx,
8233                                             child, leader, child_ctx);
8234                 if (IS_ERR(child_ctr))
8235                         return PTR_ERR(child_ctr);
8236         }
8237         return 0;
8238 }
8239
8240 static int
8241 inherit_task_group(struct perf_event *event, struct task_struct *parent,
8242                    struct perf_event_context *parent_ctx,
8243                    struct task_struct *child, int ctxn,
8244                    int *inherited_all)
8245 {
8246         int ret;
8247         struct perf_event_context *child_ctx;
8248
8249         if (!event->attr.inherit) {
8250                 *inherited_all = 0;
8251                 return 0;
8252         }
8253
8254         child_ctx = child->perf_event_ctxp[ctxn];
8255         if (!child_ctx) {
8256                 /*
8257                  * This is executed from the parent task context, so
8258                  * inherit events that have been marked for cloning.
8259                  * First allocate and initialize a context for the
8260                  * child.
8261                  */
8262
8263                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
8264                 if (!child_ctx)
8265                         return -ENOMEM;
8266
8267                 child->perf_event_ctxp[ctxn] = child_ctx;
8268         }
8269
8270         ret = inherit_group(event, parent, parent_ctx,
8271                             child, child_ctx);
8272
8273         if (ret)
8274                 *inherited_all = 0;
8275
8276         return ret;
8277 }
8278
8279 /*
8280  * Initialize the perf_event context in task_struct
8281  */
8282 static int perf_event_init_context(struct task_struct *child, int ctxn)
8283 {
8284         struct perf_event_context *child_ctx, *parent_ctx;
8285         struct perf_event_context *cloned_ctx;
8286         struct perf_event *event;
8287         struct task_struct *parent = current;
8288         int inherited_all = 1;
8289         unsigned long flags;
8290         int ret = 0;
8291
8292         if (likely(!parent->perf_event_ctxp[ctxn]))
8293                 return 0;
8294
8295         /*
8296          * If the parent's context is a clone, pin it so it won't get
8297          * swapped under us.
8298          */
8299         parent_ctx = perf_pin_task_context(parent, ctxn);
8300         if (!parent_ctx)
8301                 return 0;
8302
8303         /*
8304          * No need to check if parent_ctx != NULL here; since we saw
8305          * it non-NULL earlier, the only reason for it to become NULL
8306          * is if we exit, and since we're currently in the middle of
8307          * a fork we can't be exiting at the same time.
8308          */
8309
8310         /*
8311          * Lock the parent list. No need to lock the child - not PID
8312          * hashed yet and not running, so nobody can access it.
8313          */
8314         mutex_lock(&parent_ctx->mutex);
8315
8316         /*
8317          * We dont have to disable NMIs - we are only looking at
8318          * the list, not manipulating it:
8319          */
8320         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8321                 ret = inherit_task_group(event, parent, parent_ctx,
8322                                          child, ctxn, &inherited_all);
8323                 if (ret)
8324                         break;
8325         }
8326
8327         /*
8328          * We can't hold ctx->lock when iterating the ->flexible_group list due
8329          * to allocations, but we need to prevent rotation because
8330          * rotate_ctx() will change the list from interrupt context.
8331          */
8332         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8333         parent_ctx->rotate_disable = 1;
8334         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8335
8336         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8337                 ret = inherit_task_group(event, parent, parent_ctx,
8338                                          child, ctxn, &inherited_all);
8339                 if (ret)
8340                         break;
8341         }
8342
8343         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8344         parent_ctx->rotate_disable = 0;
8345
8346         child_ctx = child->perf_event_ctxp[ctxn];
8347
8348         if (child_ctx && inherited_all) {
8349                 /*
8350                  * Mark the child context as a clone of the parent
8351                  * context, or of whatever the parent is a clone of.
8352                  *
8353                  * Note that if the parent is a clone, the holding of
8354                  * parent_ctx->lock avoids it from being uncloned.
8355                  */
8356                 cloned_ctx = parent_ctx->parent_ctx;
8357                 if (cloned_ctx) {
8358                         child_ctx->parent_ctx = cloned_ctx;
8359                         child_ctx->parent_gen = parent_ctx->parent_gen;
8360                 } else {
8361                         child_ctx->parent_ctx = parent_ctx;
8362                         child_ctx->parent_gen = parent_ctx->generation;
8363                 }
8364                 get_ctx(child_ctx->parent_ctx);
8365         }
8366
8367         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8368         mutex_unlock(&parent_ctx->mutex);
8369
8370         perf_unpin_context(parent_ctx);
8371         put_ctx(parent_ctx);
8372
8373         return ret;
8374 }
8375
8376 /*
8377  * Initialize the perf_event context in task_struct
8378  */
8379 int perf_event_init_task(struct task_struct *child)
8380 {
8381         int ctxn, ret;
8382
8383         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8384         mutex_init(&child->perf_event_mutex);
8385         INIT_LIST_HEAD(&child->perf_event_list);
8386
8387         for_each_task_context_nr(ctxn) {
8388                 ret = perf_event_init_context(child, ctxn);
8389                 if (ret) {
8390                         perf_event_free_task(child);
8391                         return ret;
8392                 }
8393         }
8394
8395         return 0;
8396 }
8397
8398 static void __init perf_event_init_all_cpus(void)
8399 {
8400         struct swevent_htable *swhash;
8401         int cpu;
8402
8403         for_each_possible_cpu(cpu) {
8404                 swhash = &per_cpu(swevent_htable, cpu);
8405                 mutex_init(&swhash->hlist_mutex);
8406                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8407         }
8408 }
8409
8410 static void perf_event_init_cpu(int cpu)
8411 {
8412         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8413
8414         mutex_lock(&swhash->hlist_mutex);
8415         swhash->online = true;
8416         if (swhash->hlist_refcount > 0) {
8417                 struct swevent_hlist *hlist;
8418
8419                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8420                 WARN_ON(!hlist);
8421                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8422         }
8423         mutex_unlock(&swhash->hlist_mutex);
8424 }
8425
8426 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8427 static void __perf_event_exit_context(void *__info)
8428 {
8429         struct remove_event re = { .detach_group = true };
8430         struct perf_event_context *ctx = __info;
8431
8432         rcu_read_lock();
8433         list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8434                 __perf_remove_from_context(&re);
8435         rcu_read_unlock();
8436 }
8437
8438 static void perf_event_exit_cpu_context(int cpu)
8439 {
8440         struct perf_event_context *ctx;
8441         struct pmu *pmu;
8442         int idx;
8443
8444         idx = srcu_read_lock(&pmus_srcu);
8445         list_for_each_entry_rcu(pmu, &pmus, entry) {
8446                 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
8447
8448                 mutex_lock(&ctx->mutex);
8449                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8450                 mutex_unlock(&ctx->mutex);
8451         }
8452         srcu_read_unlock(&pmus_srcu, idx);
8453 }
8454
8455 static void perf_event_exit_cpu(int cpu)
8456 {
8457         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8458
8459         perf_event_exit_cpu_context(cpu);
8460
8461         mutex_lock(&swhash->hlist_mutex);
8462         swhash->online = false;
8463         swevent_hlist_release(swhash);
8464         mutex_unlock(&swhash->hlist_mutex);
8465 }
8466 #else
/* Neither CPU hotplug nor kexec is configured: nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
8468 #endif
8469
8470 static int
8471 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8472 {
8473         int cpu;
8474
8475         for_each_online_cpu(cpu)
8476                 perf_event_exit_cpu(cpu);
8477
8478         return NOTIFY_OK;
8479 }
8480
8481 /*
8482  * Run the perf reboot notifier at the very last possible moment so that
8483  * the generic watchdog code runs as long as possible.
8484  */
8485 static struct notifier_block perf_reboot_notifier = {
8486         .notifier_call = perf_reboot,
8487         .priority = INT_MIN,
8488 };
8489
8490 static int
8491 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8492 {
8493         unsigned int cpu = (long)hcpu;
8494
8495         switch (action & ~CPU_TASKS_FROZEN) {
8496
8497         case CPU_UP_PREPARE:
8498         case CPU_DOWN_FAILED:
8499                 perf_event_init_cpu(cpu);
8500                 break;
8501
8502         case CPU_UP_CANCELED:
8503         case CPU_DOWN_PREPARE:
8504                 perf_event_exit_cpu(cpu);
8505                 break;
8506         default:
8507                 break;
8508         }
8509
8510         return NOTIFY_OK;
8511 }
8512
8513 void __init perf_event_init(void)
8514 {
8515         int ret;
8516
8517         idr_init(&pmu_idr);
8518
8519         perf_event_init_all_cpus();
8520         init_srcu_struct(&pmus_srcu);
8521         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8522         perf_pmu_register(&perf_cpu_clock, NULL, -1);
8523         perf_pmu_register(&perf_task_clock, NULL, -1);
8524         perf_tp_register();
8525         perf_cpu_notifier(perf_cpu_notify);
8526         register_reboot_notifier(&perf_reboot_notifier);
8527
8528         ret = init_hw_breakpoint();
8529         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
8530
8531         /* do not patch jump label more than once per second */
8532         jump_label_rate_limit(&perf_sched_events, HZ);
8533
8534         /*
8535          * Build time assertion that we keep the data_head at the intended
8536          * location.  IOW, validation we got the __reserved[] size right.
8537          */
8538         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8539                      != 1024);
8540 }
8541
8542 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8543                               char *page)
8544 {
8545         struct perf_pmu_events_attr *pmu_attr =
8546                 container_of(attr, struct perf_pmu_events_attr, attr);
8547
8548         if (pmu_attr->event_str)
8549                 return sprintf(page, "%s\n", pmu_attr->event_str);
8550
8551         return 0;
8552 }
8553
8554 static int __init perf_event_sysfs_init(void)
8555 {
8556         struct pmu *pmu;
8557         int ret;
8558
8559         mutex_lock(&pmus_lock);
8560
8561         ret = bus_register(&pmu_bus);
8562         if (ret)
8563                 goto unlock;
8564
8565         list_for_each_entry(pmu, &pmus, entry) {
8566                 if (!pmu->name || pmu->type < 0)
8567                         continue;
8568
8569                 ret = pmu_dev_alloc(pmu);
8570                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8571         }
8572         pmu_bus_running = 1;
8573         ret = 0;
8574
8575 unlock:
8576         mutex_unlock(&pmus_lock);
8577
8578         return ret;
8579 }
8580 device_initcall(perf_event_sysfs_init);
8581
8582 #ifdef CONFIG_CGROUP_PERF
8583 static struct cgroup_subsys_state *
8584 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8585 {
8586         struct perf_cgroup *jc;
8587
8588         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
8589         if (!jc)
8590                 return ERR_PTR(-ENOMEM);
8591
8592         jc->info = alloc_percpu(struct perf_cgroup_info);
8593         if (!jc->info) {
8594                 kfree(jc);
8595                 return ERR_PTR(-ENOMEM);
8596         }
8597
8598         return &jc->css;
8599 }
8600
8601 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
8602 {
8603         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
8604
8605         free_percpu(jc->info);
8606         kfree(jc);
8607 }
8608
8609 static int __perf_cgroup_move(void *info)
8610 {
8611         struct task_struct *task = info;
8612         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
8613         return 0;
8614 }
8615
8616 static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8617                                struct cgroup_taskset *tset)
8618 {
8619         struct task_struct *task;
8620
8621         cgroup_taskset_for_each(task, tset)
8622                 task_function_call(task, __perf_cgroup_move, task);
8623 }
8624
8625 static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8626                              struct cgroup_subsys_state *old_css,
8627                              struct task_struct *task)
8628 {
8629         /*
8630          * cgroup_exit() is called in the copy_process() failure path.
8631          * Ignore this case since the task hasn't ran yet, this avoids
8632          * trying to poke a half freed task state from generic code.
8633          */
8634         if (!(task->flags & PF_EXITING))
8635                 return;
8636
8637         task_function_call(task, __perf_cgroup_move, task);
8638 }
8639
8640 struct cgroup_subsys perf_event_cgrp_subsys = {
8641         .css_alloc      = perf_cgroup_css_alloc,
8642         .css_free       = perf_cgroup_css_free,
8643         .exit           = perf_cgroup_exit,
8644         .attach         = perf_cgroup_attach,
8645 };
8646 #endif /* CONFIG_CGROUP_PERF */