drivers/oprofile/cpu_buffer.c

   1 /**
   2  * @file cpu_buffer.c
   3  *
   4  * @remark Copyright 2002-2009 OProfile authors
   5  * @remark Read the file COPYING
   6  *
   7  * @author John Levon <[email protected]>
   8  * @author Barry Kasindorf <[email protected]>
   9  * @author Robert Richter <[email protected]>
  10  *
  11  * Each CPU has a local buffer that stores PC value/event
  12  * pairs. We also log context switches when we notice them.
  13  * Eventually each CPU's buffer is processed into the global
  14  * event buffer by sync_buffer().
  15  *
  16  * We use a local buffer for two reasons: an NMI or similar
  17  * interrupt cannot synchronise, and high sampling rates
  18  * would lead to catastrophic global synchronisation if
  19  * a global buffer was used.
  20  */
  21
  22 #include <linux/sched.h>
  23 #include <linux/oprofile.h>
  24 #include <linux/errno.h>
  25
  26 #include <asm/ptrace.h>
  27
  28 #include "event_buffer.h"
  29 #include "cpu_buffer.h"
  30 #include "buffer_sync.h"
  31 #include "oprof.h"
  32
  33 #define OP_BUFFER_FLAGS 0
  34
  35 static struct ring_buffer *op_ring_buffer;
  36 DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);
  37
  38 static void wq_sync_buffer(struct work_struct *work);
  39
  40 #define DEFAULT_TIMER_EXPIRE (HZ / 10)
  41 static int work_enabled;
  42
  43 unsigned long oprofile_get_cpu_buffer_size(void)
  44 {
  45         return oprofile_cpu_buffer_size;
  46 }
  47
  48 void oprofile_cpu_buffer_inc_smpl_lost(void)
  49 {
  50         struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
  51
  52         cpu_buf->sample_lost_overflow++;
  53 }
  54
  55 void free_cpu_buffers(void)
  56 {
  57         if (op_ring_buffer)
  58                 ring_buffer_free(op_ring_buffer);
  59         op_ring_buffer = NULL;
  60 }
  61
  62 #define RB_EVENT_HDR_SIZE 4
  63
  64 int alloc_cpu_buffers(void)
  65 {
  66         int i;
  67
  68         unsigned long buffer_size = oprofile_cpu_buffer_size;
  69         unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
  70                                                  RB_EVENT_HDR_SIZE);
  71
  72         op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
  73         if (!op_ring_buffer)
  74                 goto fail;
  75
  76         for_each_possible_cpu(i) {
  77                 struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
  78
  79                 b->last_task = NULL;
  80                 b->last_is_kernel = -1;
  81                 b->tracing = 0;
  82                 b->buffer_size = buffer_size;
  83                 b->sample_received = 0;
  84                 b->sample_lost_overflow = 0;
  85                 b->backtrace_aborted = 0;
  86                 b->sample_invalid_eip = 0;
  87                 b->cpu = i;
  88                 INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
  89         }
  90         return 0;
  91
  92 fail:
  93         free_cpu_buffers();
  94         return -ENOMEM;
  95 }
  96
  97 void start_cpu_work(void)
  98 {
  99         int i;
 100
 101         work_enabled = 1;
 102
 103         for_each_online_cpu(i) {
 104                 struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
 105
 106                 /*
 107                  * Spread the work by 1 jiffy per cpu so they dont all
 108                  * fire at once.
 109                  */
 110                 schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
 111         }
 112 }
 113
 114 void end_cpu_work(void)
 115 {
 116         work_enabled = 0;
 117 }
 118
 119 void flush_cpu_work(void)
 120 {
 121         int i;
 122
 123         for_each_online_cpu(i) {
 124                 struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
 125
 126                 /* these works are per-cpu, no need for flush_sync */
 127                 flush_delayed_work(&b->work);
 128         }
 129 }
 130
 131 /*
 132  * This function prepares the cpu buffer to write a sample.
 133  *
 134  * Struct op_entry is used during operations on the ring buffer while
 135  * struct op_sample contains the data that is stored in the ring
 136  * buffer. Struct entry can be uninitialized. The function reserves a
 137  * data array that is specified by size. Use
 138  * op_cpu_buffer_write_commit() after preparing the sample. In case of
 139  * errors a null pointer is returned, otherwise the pointer to the
 140  * sample.
 141  *
 142  */
 143 struct op_sample
 144 *op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
 145 {
 146         entry->event = ring_buffer_lock_reserve
 147                 (op_ring_buffer, sizeof(struct op_sample) +
 148                  size * sizeof(entry->sample->data[0]));
 149         if (!entry->event)
 150                 return NULL;
 151         entry->sample = ring_buffer_event_data(entry->event);
 152         entry->size = size;
 153         entry->data = entry->sample->data;
 154
 155         return entry->sample;
 156 }
 157
 158 int op_cpu_buffer_write_commit(struct op_entry *entry)
 159 {
 160         return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
 161 }
 162
 163 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 164 {
 165         struct ring_buffer_event *e;
 166         e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
 167         if (!e)
 168                 return NULL;
 169
 170         entry->event = e;
 171         entry->sample = ring_buffer_event_data(e);
 172         entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
 173                 / sizeof(entry->sample->data[0]);
 174         entry->data = entry->sample->data;
 175         return entry->sample;
 176 }
 177
 178 unsigned long op_cpu_buffer_entries(int cpu)
 179 {
 180         return ring_buffer_entries_cpu(op_ring_buffer, cpu);
 181 }
 182
 183 static int
 184 op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
 185             int is_kernel, struct task_struct *task)
 186 {
 187         struct op_entry entry;
 188         struct op_sample *sample;
 189         unsigned long flags;
 190         int size;
 191
 192         flags = 0;
 193
 194         if (backtrace)
 195                 flags |= TRACE_BEGIN;
 196
 197         /* notice a switch from user->kernel or vice versa */
 198         is_kernel = !!is_kernel;
 199         if (cpu_buf->last_is_kernel != is_kernel) {
 200                 cpu_buf->last_is_kernel = is_kernel;
 201                 flags |= KERNEL_CTX_SWITCH;
 202                 if (is_kernel)
 203                         flags |= IS_KERNEL;
 204         }
 205
 206         /* notice a task switch */
 207         if (cpu_buf->last_task != task) {
 208                 cpu_buf->last_task = task;
 209                 flags |= USER_CTX_SWITCH;
 210         }
 211
 212         if (!flags)
 213                 /* nothing to do */
 214                 return 0;
 215
 216         if (flags & USER_CTX_SWITCH)
 217                 size = 1;
 218         else
 219                 size = 0;
 220
 221         sample = op_cpu_buffer_write_reserve(&entry, size);
 222         if (!sample)
 223                 return -ENOMEM;
 224
 225         sample->eip = ESCAPE_CODE;
 226         sample->event = flags;
 227
 228         if (size)
 229                 op_cpu_buffer_add_data(&entry, (unsigned long)task);
 230
 231         op_cpu_buffer_write_commit(&entry);
 232
 233         return 0;
 234 }
 235
 236 static inline int
 237 op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
 238               unsigned long pc, unsigned long event)
 239 {
 240         struct op_entry entry;
 241         struct op_sample *sample;
 242
 243         sample = op_cpu_buffer_write_reserve(&entry, 0);
 244         if (!sample)
 245                 return -ENOMEM;
 246
 247         sample->eip = pc;
 248         sample->event = event;
 249
 250         return op_cpu_buffer_write_commit(&entry);
 251 }
 252
 253 /*
 254  * This must be safe from any context.
 255  *
 256  * is_kernel is needed because on some architectures you cannot
 257  * tell if you are in kernel or user space simply by looking at
 258  * pc. We tag this in the buffer by generating kernel enter/exit
 259  * events whenever is_kernel changes
 260  */
 261 static int
 262 log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
 263            unsigned long backtrace, int is_kernel, unsigned long event,
 264            struct task_struct *task)
 265 {
 266         struct task_struct *tsk = task ? task : current;
 267         cpu_buf->sample_received++;
 268
 269         if (pc == ESCAPE_CODE) {
 270                 cpu_buf->sample_invalid_eip++;
 271                 return 0;
 272         }
 273
 274         if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
 275                 goto fail;
 276
 277         if (op_add_sample(cpu_buf, pc, event))
 278                 goto fail;
 279
 280         return 1;
 281
 282 fail:
 283         cpu_buf->sample_lost_overflow++;
 284         return 0;
 285 }
 286
 287 static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
 288 {
 289         cpu_buf->tracing = 1;
 290 }
 291
 292 static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
 293 {
 294         cpu_buf->tracing = 0;
 295 }
 296
 297 static inline void
 298 __oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
 299                           unsigned long event, int is_kernel,
 300                           struct task_struct *task)
 301 {
 302         struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
 303         unsigned long backtrace = oprofile_backtrace_depth;
 304
 305         /*
 306          * if log_sample() fail we can't backtrace since we lost the
 307          * source of this event
 308          */
 309         if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
 310                 /* failed */
 311                 return;
 312
 313         if (!backtrace)
 314                 return;
 315
 316         oprofile_begin_trace(cpu_buf);
 317         oprofile_ops.backtrace(regs, backtrace);
 318         oprofile_end_trace(cpu_buf);
 319 }
 320
 321 void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
 322                                 unsigned long event, int is_kernel,
 323                                 struct task_struct *task)
 324 {
 325         __oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
 326 }
 327
 328 void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
 329                              unsigned long event, int is_kernel)
 330 {
 331         __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
 332 }
 333
 334 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
 335 {
 336         int is_kernel;
 337         unsigned long pc;
 338
 339         if (likely(regs)) {
 340                 is_kernel = !user_mode(regs);
 341                 pc = profile_pc(regs);
 342         } else {
 343                 is_kernel = 0;    /* This value will not be used */
 344                 pc = ESCAPE_CODE; /* as this causes an early return. */
 345         }
 346
 347         __oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
 348 }
 349
 350 /*
 351  * Add samples with data to the ring buffer.
 352  *
 353  * Use oprofile_add_data(&entry, val) to add data and
 354  * oprofile_write_commit(&entry) to commit the sample.
 355  */
 356 void
 357 oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
 358                        unsigned long pc, int code, int size)
 359 {
 360         struct op_sample *sample;
 361         int is_kernel = !user_mode(regs);
 362         struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
 363
 364         cpu_buf->sample_received++;
 365
 366         /* no backtraces for samples with data */
 367         if (op_add_code(cpu_buf, 0, is_kernel, current))
 368                 goto fail;
 369
 370         sample = op_cpu_buffer_write_reserve(entry, size + 2);
 371         if (!sample)
 372                 goto fail;
 373         sample->eip = ESCAPE_CODE;
 374         sample->event = 0;              /* no flags */
 375
 376         op_cpu_buffer_add_data(entry, code);
 377         op_cpu_buffer_add_data(entry, pc);
 378
 379         return;
 380
 381 fail:
 382         entry->event = NULL;
 383         cpu_buf->sample_lost_overflow++;
 384 }
 385
 386 int oprofile_add_data(struct op_entry *entry, unsigned long val)
 387 {
 388         if (!entry->event)
 389                 return 0;
 390         return op_cpu_buffer_add_data(entry, val);
 391 }
 392
 393 int oprofile_add_data64(struct op_entry *entry, u64 val)
 394 {
 395         if (!entry->event)
 396                 return 0;
 397         if (op_cpu_buffer_get_size(entry) < 2)
 398                 /*
 399                  * the function returns 0 to indicate a too small
 400                  * buffer, even if there is some space left
 401                  */
 402                 return 0;
 403         if (!op_cpu_buffer_add_data(entry, (u32)val))
 404                 return 0;
 405         return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
 406 }
 407
 408 int oprofile_write_commit(struct op_entry *entry)
 409 {
 410         if (!entry->event)
 411                 return -EINVAL;
 412         return op_cpu_buffer_write_commit(entry);
 413 }
 414
 415 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
 416 {
 417         struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
 418         log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
 419 }
 420
 421 void oprofile_add_trace(unsigned long pc)
 422 {
 423         struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
 424
 425         if (!cpu_buf->tracing)
 426                 return;
 427
 428         /*
 429          * broken frame can give an eip with the same value as an
 430          * escape code, abort the trace if we get it
 431          */
 432         if (pc == ESCAPE_CODE)
 433                 goto fail;
 434
 435         if (op_add_sample(cpu_buf, pc, 0))
 436                 goto fail;
 437
 438         return;
 439 fail:
 440         cpu_buf->tracing = 0;
 441         cpu_buf->backtrace_aborted++;
 442         return;
 443 }
 444
 445 /*
 446  * This serves to avoid cpu buffer overflow, and makes sure
 447  * the task mortuary progresses
 448  *
 449  * By using schedule_delayed_work_on and then schedule_delayed_work
 450  * we guarantee this will stay on the correct cpu
 451  */
 452 static void wq_sync_buffer(struct work_struct *work)
 453 {
 454         struct oprofile_cpu_buffer *b =
 455                 container_of(work, struct oprofile_cpu_buffer, work.work);
 456         if (b->cpu != smp_processor_id() && !cpu_online(b->cpu)) {
 457                 cancel_delayed_work(&b->work);
 458                 return;
 459         }
 460         sync_buffer(b->cpu);
 461
 462         /* don't re-add the work if we're shutting down */
 463         if (work_enabled)
 464                 schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
 465 }