VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 31
- EXTRAVERSION = -rc1
+ EXTRAVERSION = -rc6
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
endif
srctree := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
- TOPDIR := $(srctree)
- # FIXME - TOPDIR is obsolete, use srctree/objtree
objtree := $(CURDIR)
src := $(srctree)
obj := $(objtree)
VPATH := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
- export srctree objtree VPATH TOPDIR
+ export srctree objtree VPATH
# SUBARCH tells the usermode build what the underlying arch is. That is set
MODFLAGS = -DMODULE
CFLAGS_MODULE = $(MODFLAGS)
AFLAGS_MODULE = $(MODFLAGS)
-LDFLAGS_MODULE =
+LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -fno-common \
- -Werror-implicit-function-declaration
+ -Werror-implicit-function-declaration \
+ -Wno-format-security \
+ -fno-delete-null-pointer-checks
KBUILD_AFLAGS := -D__ASSEMBLY__
# Read KERNELRELEASE from include/config/kernel.release (if it exists)
KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
# disable invalid "can't wrap" optimizations for signed / pointers
- KBUILD_CFLAGS += $(call cc-option,-fwrapv)
+ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
# revert to pre-gcc-4.4 behaviour of .eh_frame
KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm)
_edata = .; /* End of data section */
}
- .data.init_task : { INIT_TASK(THREAD_SIZE); }
+ .data.init_task : { INIT_TASK_DATA(THREAD_SIZE); }
/* might get freed after init */
. = ALIGN(PAGE_SIZE);
__init_end = .;
/* freed after init ends here */
- BSS(4)
+ BSS_SECTION(0, PAGE_SIZE, 4)
_end = . ;
. = ALIGN(PAGE_SIZE);
pg0 = .;
- /* Sections to be discarded */
- /DISCARD/ : {
- EXIT_CALL
- }
-
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
}
#endif
}
-static size_t pcpur_size __initdata;
-static void **pcpur_ptrs __initdata;
-
-static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
-{
- size_t off = (size_t)pageno << PAGE_SHIFT;
-
- if (off >= pcpur_size)
- return NULL;
-
- return virt_to_page(pcpur_ptrs[cpu] + off);
-}
-
#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
static void __init pcpu_map_range(unsigned long start, unsigned long end,
size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
static struct vm_struct vm;
unsigned long delta, cpu;
- size_t pcpu_unit_size;
+ size_t size_sum, pcpu_unit_size;
size_t ptrs_size;
+ void **ptrs;
- pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
+ size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE);
+ dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
- ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
- ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
- pcpur_ptrs = alloc_bootmem(ptrs_size);
++ ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
+ ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) {
- pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
- PCPU_CHUNK_SIZE);
+ ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
+ PCPU_CHUNK_SIZE);
- free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
- PCPU_CHUNK_SIZE - pcpur_size);
+ free_bootmem(__pa(ptrs[cpu] + size_sum),
+ PCPU_CHUNK_SIZE - size_sum);
- memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+ memcpy(ptrs[cpu], __per_cpu_load, static_size);
}
/* allocate address and map */
vm.flags = VM_ALLOC;
- vm.size = num_possible_cpus() * PCPU_CHUNK_SIZE;
+ vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
for_each_possible_cpu(cpu) {
start += cpu * PCPU_CHUNK_SIZE;
end = start + PCPU_CHUNK_SIZE;
- pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
+ pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
}
- pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+ pcpu_unit_size = pcpu_setup_first_chunk(static_size,
PERCPU_MODULE_RESERVE, dyn_size,
PCPU_CHUNK_SIZE, vm.addr, NULL);
- free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+ free_bootmem(__pa(ptrs), ptrs_size);
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
config HAVE_SETUP_PER_CPU_AREA
def_bool y
-config HAVE_DYNAMIC_PER_CPU_AREA
- def_bool y
-
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
- select HAVE_PERF_COUNTERS if (!M386 && !M486)
config X86_IO_APIC
def_bool y
recommended you say N here while the DMAR code remains
experimental.
+ config DMAR_BROKEN_GFX_WA
+ def_bool n
+ prompt "Workaround broken graphics drivers (going away soon)"
+ depends on DMAR
+ ---help---
+ Current Graphics drivers tend to use physical address
+ for DMA and avoid using DMA APIs. Setting this config
+ option permits the IOMMU driver to set a unity map for
+ all the OS-visible memory. Hence the driver can continue
+ to use physical addresses for DMA, at least until this
+ option is removed in the 2.6.32 kernel.
+
config DMAR_FLOPPY_WA
def_bool y
depends on DMAR
m->cs, m->ip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
- printk("\n");
+ printk(KERN_CONT "\n");
}
printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
- printk("ADDR %llx ", m->addr);
+ printk(KERN_CONT "ADDR %llx ", m->addr);
if (m->misc)
- printk("MISC %llx ", m->misc);
- printk("\n");
+ printk(KERN_CONT "MISC %llx ", m->misc);
+ printk(KERN_CONT "\n");
printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid,
m->apicid);
static void print_mce_head(void)
{
- printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+ printk(KERN_EMERG "\nHARDWARE ERROR\n");
}
static void print_mce_tail(void)
{
printk(KERN_EMERG "This is not a software problem!\n"
- KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+ "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(next_interval);
+ n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
else
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
+ int *n = &__get_cpu_var(mce_next_interval);
if (mce_ignore_ce)
return;
const char *buf, size_t siz)
{
char *p;
- int len;
strncpy(mce_helper, buf, sizeof(mce_helper));
mce_helper[sizeof(mce_helper)-1] = 0;
- len = strlen(mce_helper);
p = strchr(mce_helper, '\n');
- if (*p)
+ if (p)
*p = 0;
- return len;
+ return strlen(mce_helper) + !!p;
}
static ssize_t set_ignore_ce(struct sys_device *s,
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
+ __get_cpu_var(mce_next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
int num_counters_fixed;
int counter_bits;
u64 counter_mask;
+ int apic;
u64 max_period;
u64 intel_ctrl;
};
.enabled = 1,
};
+ /*
+ * Not sure about some of these
+ */
+ static const u64 p6_perfmon_event_map[] =
+ {
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+ };
+
+ static u64 p6_pmu_event_map(int event)
+ {
+ return p6_perfmon_event_map[event];
+ }
+
+ /*
+ * Counter setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+ #define P6_NOP_COUNTER 0x0000002EULL
+
+ static u64 p6_pmu_raw_event(u64 event)
+ {
+ #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
+ #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+ #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
+ #define P6_EVNTSEL_INV_MASK 0x00800000ULL
+ #define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+ #define P6_EVNTSEL_MASK \
+ (P6_EVNTSEL_EVENT_MASK | \
+ P6_EVNTSEL_UNIT_MASK | \
+ P6_EVNTSEL_EDGE_MASK | \
+ P6_EVNTSEL_INV_MASK | \
+ P6_EVNTSEL_COUNTER_MASK)
+
+ return event & P6_EVNTSEL_MASK;
+ }
+
+
/*
* Intel PerfMon v3. Used on Core2 and later.
*/
static bool reserve_pmc_hardware(void)
{
+ #ifdef CONFIG_X86_LOCAL_APIC
int i;
if (nmi_watchdog == NMI_LOCAL_APIC)
if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
goto eventsel_fail;
}
+ #endif
return true;
+ #ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu.eventsel + i);
enable_lapic_nmi_watchdog();
return false;
+ #endif
}
static void release_pmc_hardware(void)
{
+ #ifdef CONFIG_X86_LOCAL_APIC
int i;
for (i = 0; i < x86_pmu.num_counters; i++) {
if (nmi_watchdog == NMI_LOCAL_APIC)
enable_lapic_nmi_watchdog();
+ #endif
}
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
struct perf_counter_attr *attr = &counter->attr;
struct hw_perf_counter *hwc = &counter->hw;
+ u64 config;
int err;
if (!x86_pmu_initialized())
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
atomic64_set(&hwc->period_left, hwc->sample_period);
+ } else {
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * counters (user-space has to fall back and
+ * sample via a hrtimer based software counter):
+ */
+ if (!x86_pmu.apic)
+ return -EOPNOTSUPP;
}
counter->destroy = hw_perf_counter_destroy;
if (attr->config >= x86_pmu.max_events)
return -EINVAL;
+
/*
* The generic map:
*/
- hwc->config |= x86_pmu.event_map(attr->config);
+ config = x86_pmu.event_map(attr->config);
+
+ if (config == 0)
+ return -ENOENT;
+
+ if (config == -1LL)
+ return -EINVAL;
+
+ hwc->config |= config;
return 0;
}
+ static void p6_pmu_disable_all(void)
+ {
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ barrier();
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
+
static void intel_pmu_disable_all(void)
{
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
return x86_pmu.disable_all();
}
+ static void p6_pmu_enable_all(void)
+ {
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ unsigned long val;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
+
static void intel_pmu_enable_all(void)
{
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
barrier();
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_counter *counter = cpuc->counters[idx];
u64 val;
if (!test_bit(idx, cpuc->active_mask))
continue;
- rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
- if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
- continue;
+
+ val = counter->hw.config;
val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
}
static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
- int err;
- err = checking_wrmsrl(hwc->config_base + idx,
+ (void)checking_wrmsrl(hwc->config_base + idx,
hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}
static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
- int err;
- err = checking_wrmsrl(hwc->config_base + idx,
- hwc->config);
+ (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
}
static inline void
{
int idx = __idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, mask;
- int err;
mask = 0xfULL << (idx * 4);
rdmsrl(hwc->config_base, ctrl_val);
ctrl_val &= ~mask;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
+ (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+ }
+
+ static inline void
+ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val = P6_NOP_COUNTER;
+
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
}
static inline void
x86_pmu_disable_counter(hwc, idx);
}
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw counter starts counting from this counter offset,
err = checking_wrmsrl(hwc->config_base, ctrl_val);
}
+ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + idx, val);
+ }
+
+
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
if (cpuc->enabled)
x86_pmu_enable_counter(hwc, idx);
- else
- x86_pmu_disable_counter(hwc, idx);
}
static int
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
local_irq_restore(flags);
}
+ static int p6_pmu_handle_irq(struct pt_regs *regs)
+ {
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ p6_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+ }
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
{
struct perf_sample_data data;
struct cpu_hw_counters *cpuc;
- int bit, cpu, loops;
+ int bit, loops;
u64 ack, status;
data.regs = regs;
data.addr = 0;
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_counters, cpu);
+ cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
status = intel_pmu_get_status();
struct cpu_hw_counters *cpuc;
struct perf_counter *counter;
struct hw_perf_counter *hwc;
- int cpu, idx, handled = 0;
+ int idx, handled = 0;
u64 val;
data.regs = regs;
data.addr = 0;
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_counters, cpu);
+ cpuc = &__get_cpu_var(cpu_hw_counters);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask))
void set_perf_counter_pending(void)
{
+ #ifdef CONFIG_X86_LOCAL_APIC
apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+ #endif
}
void perf_counters_lapic_init(void)
{
- if (!x86_pmu_initialized())
+ #ifdef CONFIG_X86_LOCAL_APIC
+ if (!x86_pmu.apic || !x86_pmu_initialized())
return;
/*
* Always use NMI for PMU
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
}
static int __kprobes
regs = args->regs;
+ #ifdef CONFIG_X86_LOCAL_APIC
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
/*
* Can't rely on the handled return value to say it was our NMI, two
* counters could trigger 'simultaneously' raising two back-to-back NMIs.
.priority = 1
};
+ static struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = p6_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_counter,
+ .disable = p6_pmu_disable_counter,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .raw_event = p6_pmu_raw_event,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_counters = 2,
+ /*
+ * Counters have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of a counter for P6-like PMU is 32 bits only.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B
+ */
+ .counter_bits = 32,
+ .counter_mask = (1ULL << 32) - 1,
+ };
+
static struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
.event_map = intel_pmu_event_map,
.raw_event = intel_pmu_raw_event,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
.num_counters = 4,
.counter_bits = 48,
.counter_mask = (1ULL << 48) - 1,
+ .apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
};
+ static int p6_pmu_init(void)
+ {
+ switch (boot_cpu_data.x86_model) {
+ case 1:
+ case 3: /* Pentium Pro */
+ case 5:
+ case 6: /* Pentium II */
+ case 7:
+ case 8:
+ case 11: /* Pentium III */
+ break;
+ case 9:
+ case 13:
+ /* Pentium M */
+ break;
+ default:
+ pr_cont("unsupported p6 CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ x86_pmu = p6_pmu;
+
+ if (!cpu_has_apic) {
+ pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+ pr_info("no hardware sampling interrupt available.\n");
+ x86_pmu.apic = 0;
+ }
+
+ return 0;
+ }
+
static int intel_pmu_init(void)
{
union cpuid10_edx edx;
unsigned int ebx;
int version;
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ /* check for P6 processor family */
+ if (boot_cpu_data.x86 == 6) {
+ return p6_pmu_init();
+ } else {
return -ENODEV;
+ }
+ }
/*
* Check whether the Architectural PerfMon supports
entry->ip[entry->nr++] = ip;
}
-static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ static DEFINE_PER_CPU(int, in_nmi_frame);
static void
static int backtrace_stack(void *data, char *name)
{
- /* Process all stacks: */
+ per_cpu(in_nmi_frame, smp_processor_id()) =
+ x86_is_stack_id(NMI_STACK, name);
+
return 0;
}
{
struct perf_callchain_entry *entry = data;
+ if (per_cpu(in_nmi_frame, smp_processor_id()))
+ return;
+
if (reliable)
callchain_store(entry, addr);
}
struct perf_callchain_entry *entry;
if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
+ entry = &__get_cpu_var(pmc_nmi_entry);
else
- entry = &__get_cpu_var(irq_entry);
+ entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
}
/*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Helpers for first chunk memory allocation
*/
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
- unsigned int cpu;
- void *ptr;
-};
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
+{
+ return pcpu_alloc_bootmem(cpu, size, size);
+}
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ free_bootmem(__pa(ptr), size);
+}
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+/*
+ * Large page remapping allocator
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static void __init pcpul_map(void *ptr, size_t size, void *addr)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
+ pmd_t *pmd, pmd_v;
- if (off >= pcpul_size)
- return NULL;
+ pmd = populate_extra_pmd((unsigned long)addr);
+ pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
+ set_pmd(pmd, pmd_v);
+}
- return virt_to_page(pcpul_map[cpu].ptr + off);
+static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+{
+ if (early_cpu_to_node(from) == early_cpu_to_node(to))
+ return LOCAL_DISTANCE;
+ else
+ return REMOTE_DISTANCE;
}
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
- size_t map_size, dyn_size;
- unsigned int cpu;
- int i, j;
+ size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+ size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t unit_map_size, unit_size;
+ int *unit_map;
+ int nr_units;
ssize_t ret;
- if (!chosen) {
- size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = nr_cpu_ids * PMD_SIZE;
-
- /* on non-NUMA, embedding is better */
- if (!pcpu_need_numa())
- return -EINVAL;
-
- /* don't consume more than 20% of vmalloc area */
- if (tot_size > vm_size / 5) {
- pr_info("PERCPU: too large chunk size %zuMB for "
- "large page remap\n", tot_size >> 20);
- return -EINVAL;
- }
- }
+ /* on non-NUMA, embedding is better */
+ if (!chosen && !pcpu_need_numa())
+ return -EINVAL;
/* need PSE */
if (!cpu_has_pse) {
return -EINVAL;
}
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- if (pcpul_size > PMD_SIZE) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
- dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
- /* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
- pcpul_map = alloc_bootmem(map_size);
-
- for_each_possible_cpu(cpu) {
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
- PMD_SIZE);
- if (!pcpul_map[cpu].ptr) {
- pr_warning("PERCPU: failed to allocate large page "
- "for cpu%u\n", cpu);
- goto enomem;
- }
-
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The 2MB up-rounding bootmem is needed to make
- * sure the partial 2MB page is still fully RAM - it's
- * not well-specified to have a PAT-incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
- PMD_SIZE - pcpul_size);
-
- memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
+ /* allocate and build unit_map */
- unit_map_size = num_possible_cpus() * sizeof(int);
++ unit_map_size = nr_cpu_ids * sizeof(int);
+ unit_map = alloc_bootmem_nopanic(unit_map_size);
+ if (!unit_map) {
+ pr_warning("PERCPU: failed to allocate unit_map\n");
+ return -ENOMEM;
}
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
- vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
- for_each_possible_cpu(cpu) {
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
- cpu * PMD_SIZE);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
+ ret = pcpu_lpage_build_unit_map(static_size,
+ PERCPU_FIRST_CHUNK_RESERVE,
+ &dyn_size, &unit_size, PMD_SIZE,
+ unit_map, pcpu_lpage_cpu_distance);
+ if (ret < 0) {
+ pr_warning("PERCPU: failed to build unit_map\n");
+ goto out_free;
}
+ nr_units = ret;
- /* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpul_vm.addr, NULL);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < nr_cpu_ids - 1; i++)
- for (j = i + 1; j < nr_cpu_ids; j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
-
- return ret;
-
-enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
- free_bootmem(__pa(pcpul_map), map_size);
- return -ENOMEM;
-}
+ /* do the parameters look okay? */
+ if (!chosen) {
+ size_t vm_size = VMALLOC_END - VMALLOC_START;
+ size_t tot_size = nr_units * unit_size;
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
- void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
- unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = nr_cpu_ids - 1;
- int pos;
-
- /* pcpul in use at all? */
- if (!pcpul_map)
- return NULL;
-
- /* okay, perform binary search */
- while (left <= right) {
- pos = (left + right) / 2;
-
- if (pcpul_map[pos].ptr < pmd_addr)
- left = pos + 1;
- else if (pcpul_map[pos].ptr > pmd_addr)
- right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * PMD_SIZE + offset;
+ /* don't consume more than 20% of vmalloc area */
+ if (tot_size > vm_size / 5) {
+ pr_info("PERCPU: too large chunk size %zuMB for "
+ "large page remap\n", tot_size >> 20);
+ ret = -EINVAL;
+ goto out_free;
}
}
- return NULL;
+ ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, unit_size, PMD_SIZE,
+ unit_map, nr_units,
+ pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+out_free:
+ if (ret < 0)
+ free_bootmem(__pa(unit_map), unit_map_size);
+ return ret;
}
#else
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
return -EINVAL;
return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
- reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
+ reserve - PERCPU_FIRST_CHUNK_RESERVE);
}
/*
- * 4k page allocator
+ * 4k allocator
*
- * This is the basic allocator. Static percpu area is allocated
- * page-by-page and most of initialization is done by the generic
- * setup function.
+ * Boring fallback 4k allocator. This allocator puts more pressure on
+ * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
*/
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_nr_static_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
-{
- if (pageno < pcpu4k_nr_static_pages)
- return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
- return NULL;
-}
-
static void __init pcpu4k_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
- size_t pages_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- pcpu4k_nr_static_pages = PFN_UP(static_size);
-
- /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
- * sizeof(pcpu4k_pages[0]));
- pcpu4k_pages = alloc_bootmem(pages_size);
-
- /* allocate and copy */
- j = 0;
- for_each_possible_cpu(cpu)
- for (i = 0; i < pcpu4k_nr_static_pages; i++) {
- void *ptr;
-
- ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
- if (!ptr) {
- pr_warning("PERCPU: failed to allocate "
- "4k page for cpu%u\n", cpu);
- goto enomem;
- }
-
- memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
- pcpu4k_pages[j++] = virt_to_page(ptr);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
- pcpu4k_nr_static_pages, static_size);
-
- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, -1,
- -1, NULL, pcpu4k_populate_pte);
- goto out_free_ar;
-
-enomem:
- while (--j >= 0)
- free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
- ret = -ENOMEM;
-out_free_ar:
- free_bootmem(__pa(pcpu4k_pages), pages_size);
- return ret;
+ return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpu4k_populate_pte);
}
/* for explicit first chunk allocator selection */
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
- per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+ per_cpu_offset(cpu) =
+ delta + pcpu_unit_map[cpu] * pcpu_unit_size;
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
_sdata = .;
DATA_DATA
CONSTRUCTORS
-
- #ifdef CONFIG_X86_64
- /* End of data section */
- _edata = .;
- #endif
} :data
#ifdef CONFIG_X86_32
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
*(.data.read_mostly)
- #ifdef CONFIG_X86_32
/* End of data section */
_edata = .;
- #endif
}
#ifdef CONFIG_X86_64
_end = .;
}
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : { *(.eh_frame) }
}
#ifdef CONFIG_X86_32
- ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
#else
/*
* Per-cpu symbols which need to be offset from __per_cpu_load
/*
* Build-time check on the image size:
*/
- ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP
- ASSERT((per_cpu__irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
+ . = ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
#endif
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_KEXEC
#include <asm/kexec.h>
- ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
+ . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big");
#endif
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
+#include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_PAGES_ARRAY)
- address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ address = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_PAGES_ARRAY)
- vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ vaddr = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
+ unsigned long addr_copy = addr;
+
ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_UC_MINUS), 0);
-
if (!ret) {
- ret = change_page_attr_set(&addr, numpages,
- __pgprot(_PAGE_CACHE_WC), 0);
+ ret = change_page_attr_set_clr(&addr_copy, numpages,
+ __pgprot(_PAGE_CACHE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, 0, NULL);
}
return ret;
}
int free_idx;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
goto err_out;
err_out:
free_idx = i;
for (i = 0; i < free_idx; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
return retval;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
-static DEFINE_PER_CPU(unsigned long, ioc_count);
+static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
cic = container_of(head, struct cfq_io_context, rcu_head);
kmem_cache_free(cfq_ioc_pool, cic);
- elv_ioc_count_dec(ioc_count);
+ elv_ioc_count_dec(cfq_ioc_count);
if (ioc_gone) {
/*
* complete ioc_gone and set it back to NULL
*/
spin_lock(&ioc_gone_lock);
- if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+ if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
complete(ioc_gone);
ioc_gone = NULL;
}
INIT_HLIST_NODE(&cic->cic_list);
cic->dtor = cfq_free_io_context;
cic->exit = cfq_exit_io_context;
- elv_ioc_count_inc(ioc_count);
+ elv_ioc_count_inc(cfq_ioc_count);
}
return cic;
goto queue_fail;
cfqq = cic_to_cfqq(cic, is_sync);
- if (!cfqq) {
+ if (!cfqq || cfqq == &cfqd->oom_cfqq) {
cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
cic_set_cfqq(cic, cfqq, is_sync);
}
* this also protects us from entering cfq_slab_kill() with
* pending RCU callbacks
*/
- if (elv_ioc_count_read(ioc_count))
+ if (elv_ioc_count_read(cfq_ioc_count))
wait_for_completion(&all_gone);
cfq_slab_kill();
}
unsigned int requested_freq;
int cpu;
unsigned int enable:1;
+ /*
+ * percpu mutex that serializes governor limit change with
+ * do_dbs_timer invocation. We do not want do_dbs_timer to run
+ * when user is changing the governor or limits.
+ */
+ struct mutex timer_mutex;
};
-static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
+static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
static unsigned int dbs_enable; /* number of CPUs using this policy */
/*
- * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
- * lock and dbs_mutex. cpu_hotplug lock should always be held before
- * dbs_mutex. If any function that can potentially take cpu_hotplug lock
- * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
- * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
- * is recursive for the same process. -Venki
- * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
- * would deadlock with cancel_delayed_work_sync(), which is needed for proper
- * raceless workqueue teardown.
+ * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+ * different CPUs. It protects dbs_enable in governor start/stop.
*/
static DEFINE_MUTEX(dbs_mutex);
void *data)
{
struct cpufreq_freqs *freq = data;
- struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info,
+ struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
freq->cpu);
struct cpufreq_policy *policy;
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct cpu_dbs_info_s *dbs_info;
- dbs_info = &per_cpu(cpu_dbs_info, j);
+ dbs_info = &per_cpu(cs_cpu_dbs_info, j);
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
cputime64_t cur_wall_time, cur_idle_time;
unsigned int idle_time, wall_time;
- j_dbs_info = &per_cpu(cpu_dbs_info, j);
+ j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
delay -= jiffies % delay;
- if (lock_policy_rwsem_write(cpu) < 0)
- return;
-
- if (!dbs_info->enable) {
- unlock_policy_rwsem_write(cpu);
- return;
- }
+ mutex_lock(&dbs_info->timer_mutex);
dbs_check_cpu(dbs_info);
queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
- unlock_policy_rwsem_write(cpu);
+ mutex_unlock(&dbs_info->timer_mutex);
}
static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
unsigned int j;
int rc;
- this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
switch (event) {
case CPUFREQ_GOV_START:
if ((!cpu_online(cpu)) || (!policy->cur))
return -EINVAL;
- if (this_dbs_info->enable) /* Already enabled */
- break;
-
mutex_lock(&dbs_mutex);
rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info_s *j_dbs_info;
- j_dbs_info = &per_cpu(cpu_dbs_info, j);
+ j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
j_dbs_info->cur_policy = policy;
j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
this_dbs_info->down_skip = 0;
this_dbs_info->requested_freq = policy->cur;
+ mutex_init(&this_dbs_info->timer_mutex);
dbs_enable++;
/*
* Start the timerschedule work, when this governor
&dbs_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
- dbs_timer_init(this_dbs_info);
-
mutex_unlock(&dbs_mutex);
+ dbs_timer_init(this_dbs_info);
+
break;
case CPUFREQ_GOV_STOP:
- mutex_lock(&dbs_mutex);
dbs_timer_exit(this_dbs_info);
+
+ mutex_lock(&dbs_mutex);
sysfs_remove_group(&policy->kobj, &dbs_attr_group);
dbs_enable--;
+ mutex_destroy(&this_dbs_info->timer_mutex);
/*
* Stop the timerschedule work, when this governor
break;
case CPUFREQ_GOV_LIMITS:
- mutex_lock(&dbs_mutex);
+ mutex_lock(&this_dbs_info->timer_mutex);
if (policy->max < this_dbs_info->cur_policy->cur)
__cpufreq_driver_target(
this_dbs_info->cur_policy,
__cpufreq_driver_target(
this_dbs_info->cur_policy,
policy->min, CPUFREQ_RELATION_L);
- mutex_unlock(&dbs_mutex);
+ mutex_unlock(&this_dbs_info->timer_mutex);
break;
}
unsigned int freq_lo_jiffies;
unsigned int freq_hi_jiffies;
int cpu;
- unsigned int enable:1,
- sample_type:1;
+ unsigned int sample_type:1;
+ /*
+ * percpu mutex that serializes governor limit change with
+ * do_dbs_timer invocation. We do not want do_dbs_timer to run
+ * when user is changing the governor or limits.
+ */
+ struct mutex timer_mutex;
};
-static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
+static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
static unsigned int dbs_enable; /* number of CPUs using this policy */
/*
- * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
- * lock and dbs_mutex. cpu_hotplug lock should always be held before
- * dbs_mutex. If any function that can potentially take cpu_hotplug lock
- * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
- * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
- * is recursive for the same process. -Venki
- * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
- * would deadlock with cancel_delayed_work_sync(), which is needed for proper
- * raceless workqueue teardown.
+ * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+ * different CPUs. It protects dbs_enable in governor start/stop.
*/
static DEFINE_MUTEX(dbs_mutex);
unsigned int freq_hi, freq_lo;
unsigned int index = 0;
unsigned int jiffies_total, jiffies_hi, jiffies_lo;
- struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);
+ struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
+ policy->cpu);
if (!dbs_info->freq_table) {
dbs_info->freq_lo = 0;
return freq_hi;
}
- struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ static void ondemand_powersave_bias_init_cpu(int cpu)
+ {
++ struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+ dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+ dbs_info->freq_lo = 0;
+ }
+
static void ondemand_powersave_bias_init(void)
{
int i;
for_each_online_cpu(i) {
- struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i);
- dbs_info->freq_table = cpufreq_frequency_get_table(i);
- dbs_info->freq_lo = 0;
+ ondemand_powersave_bias_init_cpu(i);
}
}
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
+ if (ret != 1)
+ return -EINVAL;
mutex_lock(&dbs_mutex);
- if (ret != 1) {
- mutex_unlock(&dbs_mutex);
- return -EINVAL;
- }
dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
mutex_unlock(&dbs_mutex);
int ret;
ret = sscanf(buf, "%u", &input);
- mutex_lock(&dbs_mutex);
if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
input < MIN_FREQUENCY_UP_THRESHOLD) {
- mutex_unlock(&dbs_mutex);
return -EINVAL;
}
+ mutex_lock(&dbs_mutex);
dbs_tuners_ins.up_threshold = input;
mutex_unlock(&dbs_mutex);
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct cpu_dbs_info_s *dbs_info;
- dbs_info = &per_cpu(cpu_dbs_info, j);
+ dbs_info = &per_cpu(od_cpu_dbs_info, j);
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
struct cpufreq_policy *policy;
unsigned int j;
- if (!this_dbs_info->enable)
- return;
-
this_dbs_info->freq_lo = 0;
policy = this_dbs_info->cur_policy;
unsigned int load, load_freq;
int freq_avg;
- j_dbs_info = &per_cpu(cpu_dbs_info, j);
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
delay -= jiffies % delay;
-
- if (lock_policy_rwsem_write(cpu) < 0)
- return;
-
- if (!dbs_info->enable) {
- unlock_policy_rwsem_write(cpu);
- return;
- }
+ mutex_lock(&dbs_info->timer_mutex);
/* Common NORMAL_SAMPLE setup */
dbs_info->sample_type = DBS_NORMAL_SAMPLE;
dbs_info->freq_lo, CPUFREQ_RELATION_H);
}
queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
- unlock_policy_rwsem_write(cpu);
+ mutex_unlock(&dbs_info->timer_mutex);
}
static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
delay -= jiffies % delay;
- dbs_info->enable = 1;
- ondemand_powersave_bias_init();
dbs_info->sample_type = DBS_NORMAL_SAMPLE;
INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
- dbs_info->enable = 0;
cancel_delayed_work_sync(&dbs_info->work);
}
unsigned int j;
int rc;
- this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
switch (event) {
case CPUFREQ_GOV_START:
if ((!cpu_online(cpu)) || (!policy->cur))
return -EINVAL;
- if (this_dbs_info->enable) /* Already enabled */
- break;
-
mutex_lock(&dbs_mutex);
- dbs_enable++;
rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
if (rc) {
- dbs_enable--;
mutex_unlock(&dbs_mutex);
return rc;
}
+ dbs_enable++;
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info_s *j_dbs_info;
- j_dbs_info = &per_cpu(cpu_dbs_info, j);
+ j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
j_dbs_info->cur_policy = policy;
j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
}
}
this_dbs_info->cpu = cpu;
+ ondemand_powersave_bias_init_cpu(cpu);
+ mutex_init(&this_dbs_info->timer_mutex);
/*
* Start the timerschedule work, when this governor
* is used for first time
max(min_sampling_rate,
latency * LATENCY_MULTIPLIER);
}
- dbs_timer_init(this_dbs_info);
-
mutex_unlock(&dbs_mutex);
+
+ dbs_timer_init(this_dbs_info);
break;
case CPUFREQ_GOV_STOP:
- mutex_lock(&dbs_mutex);
dbs_timer_exit(this_dbs_info);
+
+ mutex_lock(&dbs_mutex);
sysfs_remove_group(&policy->kobj, &dbs_attr_group);
+ mutex_destroy(&this_dbs_info->timer_mutex);
dbs_enable--;
mutex_unlock(&dbs_mutex);
break;
case CPUFREQ_GOV_LIMITS:
- mutex_lock(&dbs_mutex);
+ mutex_lock(&this_dbs_info->timer_mutex);
if (policy->max < this_dbs_info->cur_policy->cur)
__cpufreq_driver_target(this_dbs_info->cur_policy,
policy->max, CPUFREQ_RELATION_H);
else if (policy->min > this_dbs_info->cur_policy->cur)
__cpufreq_driver_target(this_dbs_info->cur_policy,
policy->min, CPUFREQ_RELATION_L);
- mutex_unlock(&dbs_mutex);
+ mutex_unlock(&this_dbs_info->timer_mutex);
break;
}
return 0;
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Interrupt types. */
enum xen_irq_type {
return IRQ_HANDLED;
}
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
/*
* Search the CPUs pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
struct pt_regs *old_regs = set_irq_regs(regs);
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- static DEFINE_PER_CPU(unsigned, nesting_count);
unsigned count;
exit_idle();
vcpu_info->evtchn_upcall_pending = 0;
- if (__get_cpu_var(nesting_count)++)
+ if (__get_cpu_var(xed_nesting_count)++)
goto out;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
BUG_ON(!irqs_disabled());
- count = __get_cpu_var(nesting_count);
- __get_cpu_var(nesting_count) = 0;
+ count = __get_cpu_var(xed_nesting_count);
+ __get_cpu_var(xed_nesting_count) = 0;
} while(count != 1);
out:
void __init xen_init_IRQ(void)
{
int i;
- size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
- cpu_evtchn_mask_p = alloc_bootmem(size);
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
BUG_ON(cpu_evtchn_mask_p == NULL);
init_evtchn_cpu_bindings();
* EXCEPTION_TABLE(...)
* NOTES
*
- * __bss_start = .;
- * BSS_SECTION(0, 0)
- * __bss_stop = .;
+ * BSS_SECTION(0, 0, 0)
* _end = .;
*
- * /DISCARD/ : {
- * EXIT_TEXT
- * EXIT_DATA
- * EXIT_CALL
- * }
* STABS_DEBUG
* DWARF_DEBUG
+ *
+ * DISCARDS // must be the last
* }
*
* [__init_begin, __init_end] is the init section that may be freed after init
. = ALIGN(align); \
*(.data.cacheline_aligned)
- #define INIT_TASK(align) \
+ #define INIT_TASK_DATA(align) \
. = ALIGN(align); \
*(.data.init_task)
/*
* Init task
*/
- #define INIT_TASK_DATA(align) \
+ #define INIT_TASK_DATA_SECTION(align) \
. = ALIGN(align); \
.data.init_task : { \
- INIT_TASK \
+ INIT_TASK_DATA(align) \
}
#ifdef CONFIG_CONSTRUCTORS
* bss (Block Started by Symbol) - uninitialized data
* zeroed during startup
*/
- #define SBSS \
+ #define SBSS(sbss_align) \
+ . = ALIGN(sbss_align); \
.sbss : AT(ADDR(.sbss) - LOAD_OFFSET) { \
*(.sbss) \
*(.scommon) \
#define BSS(bss_align) \
. = ALIGN(bss_align); \
.bss : AT(ADDR(.bss) - LOAD_OFFSET) { \
- VMLINUX_SYMBOL(__bss_start) = .; \
*(.bss.page_aligned) \
*(.dynbss) \
*(.bss) \
*(COMMON) \
- VMLINUX_SYMBOL(__bss_stop) = .; \
}
/*
#define INIT_RAM_FS
#endif
+/*
+ * Default discarded sections.
+ *
+ * Some archs want to discard exit text/data at runtime rather than
+ * link time due to cross-section references such as alt instructions,
+ * bug table, eh_frame, etc. DISCARDS must be the last of output
+ * section definitions so that such archs put those in earlier section
+ * definitions.
+ */
+#define DISCARDS \
+ /DISCARD/ : { \
+ EXIT_TEXT \
+ EXIT_DATA \
+ EXIT_CALL \
+ *(.discard) \
+ }
+
/**
* PERCPU_VADDR - define output section for percpu area
* @vaddr: explicit base address (optional)
* matches the requirment of PAGE_ALIGNED_DATA.
*
* use 0 as page_align if page_aligned data is not used */
- #define RW_DATA_SECTION(cacheline, nosave, pagealigned, inittask) \
+ #define RW_DATA_SECTION(cacheline, pagealigned, inittask) \
. = ALIGN(PAGE_SIZE); \
.data : AT(ADDR(.data) - LOAD_OFFSET) { \
- INIT_TASK(inittask) \
+ INIT_TASK_DATA(inittask) \
CACHELINE_ALIGNED_DATA(cacheline) \
READ_MOSTLY_DATA(cacheline) \
DATA_DATA \
CONSTRUCTORS \
- NOSAVE_DATA(nosave) \
+ NOSAVE_DATA \
PAGE_ALIGNED_DATA(pagealigned) \
}
INIT_RAM_FS \
}
- #define BSS_SECTION(sbss_align, bss_align) \
- SBSS \
+ #define BSS_SECTION(sbss_align, bss_align, stop_align) \
+ . = ALIGN(sbss_align); \
+ VMLINUX_SYMBOL(__bss_start) = .; \
+ SBSS(sbss_align) \
BSS(bss_align) \
- . = ALIGN(4);
-
+ . = ALIGN(stop_align); \
+ VMLINUX_SYMBOL(__bss_stop) = .;
#define smp_init() do { } while (0)
#endif
-static inline void setup_per_cpu_areas(void) { }
static inline void setup_nr_cpu_ids(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-
-EXPORT_SYMBOL(__per_cpu_offset);
-
-static void __init setup_per_cpu_areas(void)
-{
- unsigned long size, i;
- char *ptr;
- unsigned long nr_possible_cpus = num_possible_cpus();
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
- ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-
- for_each_possible_cpu(i) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- ptr += size;
- }
-}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
setup_command_line(command_line);
- setup_per_cpu_areas();
setup_nr_cpu_ids();
+ setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
build_all_zonelists();
#ifdef CONFIG_SMP
-#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
free_percpu(freeme);
}
-#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
}
__initcall(percpu_modinit);
-#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
{
const unsigned long *crc;
- if (!find_symbol("module_layout", NULL, &crc, true, false))
+ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ &crc, true, false))
BUG();
return check_version(sechdrs, versindex, "module_layout", mod, crc);
}
return ret;
}
if (ret > 0) {
- printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
- "it should follow 0/-E convention\n"
- KERN_WARNING "%s: loading module anyway...\n",
+ printk(KERN_WARNING
+ "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
__func__, mod->name, ret,
__func__);
dump_stack();
static atomic_t nr_counters __read_mostly;
static atomic_t nr_mmap_counters __read_mostly;
static atomic_t nr_comm_counters __read_mostly;
+ static atomic_t nr_task_counters __read_mostly;
/*
* perf counter paranoia level:
void __weak hw_perf_enable(void) { barrier(); }
void __weak hw_perf_counter_setup(int cpu) { barrier(); }
+ void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
int __weak
hw_perf_group_sched_in(struct perf_counter *group_leader,
void __weak perf_counter_print_debug(void) { }
-static DEFINE_PER_CPU(int, disable_count);
+static DEFINE_PER_CPU(int, perf_disable_count);
void __perf_disable(void)
{
- __get_cpu_var(disable_count)++;
+ __get_cpu_var(perf_disable_count)++;
}
bool __perf_enable(void)
{
- return !--__get_cpu_var(disable_count);
+ return !--__get_cpu_var(perf_disable_count);
}
void perf_disable(void)
}
}
+ static void unclone_ctx(struct perf_counter_context *ctx)
+ {
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+ }
+
+ /*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+ static u64 primary_counter_id(struct perf_counter *counter)
+ {
+ u64 id = counter->id;
+
+ if (counter->parent)
+ id = counter->parent->id;
+
+ return id;
+ }
+
/*
* Get the perf_counter_context for a task and lock it.
* This has to cope with with the fact that until it is locked,
return;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
counter->tstamp_stopped = ctx->time;
counter->pmu->disable(counter);
counter->oncpu = -1;
__perf_counter_sync_stat(counter, next_counter);
counter = list_next_entry(counter, event_entry);
- next_counter = list_next_entry(counter, event_entry);
+ next_counter = list_next_entry(next_counter, event_entry);
}
}
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_counter *counter, int enable);
- static void perf_log_period(struct perf_counter *counter, u64 period);
static void perf_adjust_period(struct perf_counter *counter, u64 events)
{
if (!sample_period)
sample_period = 1;
- perf_log_period(counter, sample_period);
-
hwc->sample_period = sample_period;
}
/*
* Unclone this context if we enabled any counter.
*/
- if (enabled && ctx->parent_ctx) {
- put_ctx(ctx->parent_ctx);
- ctx->parent_ctx = NULL;
- }
+ if (enabled)
+ unclone_ctx(ctx);
spin_unlock(&ctx->lock);
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
- struct perf_counter_context *parent_ctx;
struct perf_counter_context *ctx;
struct perf_cpu_context *cpuctx;
struct task_struct *task;
retry:
ctx = perf_lock_task_context(task, &flags);
if (ctx) {
- parent_ctx = ctx->parent_ctx;
- if (parent_ctx) {
- put_ctx(parent_ctx);
- ctx->parent_ctx = NULL; /* no longer a clone */
- }
+ unclone_ctx(ctx);
spin_unlock_irqrestore(&ctx->lock, flags);
}
atomic_dec(&nr_mmap_counters);
if (counter->attr.comm)
atomic_dec(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_dec(&nr_task_counters);
}
if (counter->destroy)
return 0;
}
+ static int perf_counter_read_size(struct perf_counter *counter)
+ {
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += counter->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+
+ return size;
+ }
+
+ static u64 perf_counter_read_value(struct perf_counter *counter)
+ {
+ struct perf_counter *child;
+ u64 total = 0;
+
+ total += perf_counter_read(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ total += perf_counter_read(child);
+
+ return total;
+ }
+
+ static int perf_counter_read_entry(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+ {
+ int n = 0, count = 0;
+ u64 values[2];
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ count = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, count))
+ return -EFAULT;
+
+ return count;
+ }
+
+ static int perf_counter_read_group(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+ {
+ struct perf_counter *leader = counter->group_leader, *sub;
+ int n = 0, size = 0, err = -EFAULT;
+ u64 values[3];
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ return -EFAULT;
+
+ err = perf_counter_read_entry(leader, read_format, buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ err = perf_counter_read_entry(counter, read_format,
+ buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+ }
+
+ return size;
+ }
+
+ static int perf_counter_read_one(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+ {
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+ }
+
/*
* Read the performance counter - simple non blocking version for now
*/
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
- u64 values[4];
- int n;
+ u64 read_format = counter->attr.read_format;
+ int ret;
/*
* Return end-of-file for a read on a counter that is in
if (counter->state == PERF_COUNTER_STATE_ERROR)
return 0;
+ if (count < perf_counter_read_size(counter))
+ return -ENOSPC;
+
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->child_mutex);
- values[0] = perf_counter_read(counter);
- n = 1;
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = counter->total_time_enabled +
- atomic64_read(&counter->child_total_time_enabled);
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = counter->total_time_running +
- atomic64_read(&counter->child_total_time_running);
- if (counter->attr.read_format & PERF_FORMAT_ID)
- values[n++] = counter->id;
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_counter_read_group(counter, read_format, buf);
+ else
+ ret = perf_counter_read_one(counter, read_format, buf);
mutex_unlock(&counter->child_mutex);
- if (count < n * sizeof(u64))
- return -EINVAL;
- count = n * sizeof(u64);
-
- if (copy_to_user(buf, values, count))
- return -EFAULT;
-
- return count;
+ return ret;
}
static ssize_t
counter->attr.sample_freq = value;
} else {
- perf_log_period(counter, value);
-
counter->attr.sample_period = value;
counter->hw.sample_period = value;
}
static void perf_mmap_free_page(unsigned long addr)
{
- struct page *page = virt_to_page(addr);
+ struct page *page = virt_to_page((void *)addr);
page->mapping = NULL;
__free_page(page);
if (counter->pending_disable) {
counter->pending_disable = 0;
- perf_counter_disable(counter);
+ __perf_counter_disable(counter);
}
if (counter->pending_wakeup) {
return task_pid_nr_ns(p, counter->ns);
}
- static void perf_counter_output(struct perf_counter *counter, int nmi,
+ static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+ {
+ u64 read_format = counter->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = atomic64_read(&counter->count);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+ }
+
+ /*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+ static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+ {
+ struct perf_counter *leader = counter->group_leader, *sub;
+ u64 read_format = counter->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = leader->total_time_enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = leader->total_time_running;
+
+ if (leader != counter)
+ leader->pmu->read(leader);
+
+ values[n++] = atomic64_read(&leader->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(leader);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ n = 0;
+
+ if (sub != counter)
+ sub->pmu->read(sub);
+
+ values[n++] = atomic64_read(&sub->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(sub);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+ }
+ }
+
+ static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+ {
+ if (counter->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, counter);
+ else
+ perf_output_read_one(handle, counter);
+ }
+
+ void perf_counter_output(struct perf_counter *counter, int nmi,
struct perf_sample_data *data)
{
int ret;
struct {
u32 pid, tid;
} tid_entry;
- struct {
- u64 id;
- u64 counter;
- } group_entry;
struct perf_callchain_entry *callchain = NULL;
int callchain_size = 0;
u64 time;
if (sample_type & PERF_SAMPLE_ID)
header.size += sizeof(u64);
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ header.size += sizeof(u64);
+
if (sample_type & PERF_SAMPLE_CPU) {
header.size += sizeof(cpu_entry);
cpu_entry.cpu = raw_smp_processor_id();
+ cpu_entry.reserved = 0;
}
if (sample_type & PERF_SAMPLE_PERIOD)
header.size += sizeof(u64);
- if (sample_type & PERF_SAMPLE_GROUP) {
- header.size += sizeof(u64) +
- counter->nr_siblings * sizeof(group_entry);
- }
+ if (sample_type & PERF_SAMPLE_READ)
+ header.size += perf_counter_read_size(counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
callchain = perf_callchain(data->regs);
header.size += sizeof(u64);
}
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header.size += size;
+ }
+
ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
if (ret)
return;
if (sample_type & PERF_SAMPLE_ADDR)
perf_output_put(&handle, data->addr);
- if (sample_type & PERF_SAMPLE_ID)
+ if (sample_type & PERF_SAMPLE_ID) {
+ u64 id = primary_counter_id(counter);
+
+ perf_output_put(&handle, id);
+ }
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
perf_output_put(&handle, counter->id);
if (sample_type & PERF_SAMPLE_CPU)
if (sample_type & PERF_SAMPLE_PERIOD)
perf_output_put(&handle, data->period);
- /*
- * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
- */
- if (sample_type & PERF_SAMPLE_GROUP) {
- struct perf_counter *leader, *sub;
- u64 nr = counter->nr_siblings;
-
- perf_output_put(&handle, nr);
-
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->pmu->read(sub);
-
- group_entry.id = sub->id;
- group_entry.counter = atomic64_read(&sub->count);
-
- perf_output_put(&handle, group_entry);
- }
- }
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(&handle, counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
if (callchain)
}
}
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(&handle, data->raw->size);
+ perf_output_copy(&handle, data->raw->data, data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(&handle, raw);
+ }
+ }
+
perf_output_end(&handle);
}
u32 pid;
u32 tid;
- u64 value;
- u64 format[3];
};
static void
.header = {
.type = PERF_EVENT_READ,
.misc = 0,
- .size = sizeof(event) - sizeof(event.format),
+ .size = sizeof(event) + perf_counter_read_size(counter),
},
.pid = perf_counter_pid(counter, task),
.tid = perf_counter_tid(counter, task),
- .value = atomic64_read(&counter->count),
};
- int ret, i = 0;
-
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- event.header.size += sizeof(u64);
- event.format[i++] = counter->total_time_enabled;
- }
-
- if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- event.header.size += sizeof(u64);
- event.format[i++] = counter->total_time_running;
- }
-
- if (counter->attr.read_format & PERF_FORMAT_ID) {
- u64 id;
-
- event.header.size += sizeof(u64);
- if (counter->parent)
- id = counter->parent->id;
- else
- id = counter->id;
-
- event.format[i++] = id;
- }
+ int ret;
ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
if (ret)
return;
- perf_output_copy(&handle, &event, event.header.size);
+ perf_output_put(&handle, event);
+ perf_output_read(&handle, counter);
+
perf_output_end(&handle);
}
/*
- * fork tracking
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
*/
- struct perf_fork_event {
- struct task_struct *task;
+ struct perf_task_event {
+ struct task_struct *task;
+ struct perf_counter_context *task_ctx;
struct {
struct perf_event_header header;
u32 pid;
u32 ppid;
+ u32 tid;
+ u32 ptid;
} event;
};
- static void perf_counter_fork_output(struct perf_counter *counter,
- struct perf_fork_event *fork_event)
+ static void perf_counter_task_output(struct perf_counter *counter,
+ struct perf_task_event *task_event)
{
struct perf_output_handle handle;
- int size = fork_event->event.header.size;
- struct task_struct *task = fork_event->task;
+ int size = task_event->event.header.size;
+ struct task_struct *task = task_event->task;
int ret = perf_output_begin(&handle, counter, size, 0, 0);
if (ret)
return;
- fork_event->event.pid = perf_counter_pid(counter, task);
- fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+ task_event->event.pid = perf_counter_pid(counter, task);
+ task_event->event.ppid = perf_counter_pid(counter, current);
+
+ task_event->event.tid = perf_counter_tid(counter, task);
+ task_event->event.ptid = perf_counter_tid(counter, current);
- perf_output_put(&handle, fork_event->event);
+ perf_output_put(&handle, task_event->event);
perf_output_end(&handle);
}
- static int perf_counter_fork_match(struct perf_counter *counter)
+ static int perf_counter_task_match(struct perf_counter *counter)
{
- if (counter->attr.comm || counter->attr.mmap)
+ if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
return 1;
return 0;
}
- static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
- struct perf_fork_event *fork_event)
+ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+ struct perf_task_event *task_event)
{
struct perf_counter *counter;
rcu_read_lock();
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
- if (perf_counter_fork_match(counter))
- perf_counter_fork_output(counter, fork_event);
+ if (perf_counter_task_match(counter))
+ perf_counter_task_output(counter, task_event);
}
rcu_read_unlock();
}
- static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+ static void perf_counter_task_event(struct perf_task_event *task_event)
{
struct perf_cpu_context *cpuctx;
- struct perf_counter_context *ctx;
+ struct perf_counter_context *ctx = task_event->task_ctx;
cpuctx = &get_cpu_var(perf_cpu_context);
- perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+ perf_counter_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);
rcu_read_lock();
- /*
- * doesn't really matter which of the child contexts the
- * events ends up in.
- */
- ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (!ctx)
+ ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
if (ctx)
- perf_counter_fork_ctx(ctx, fork_event);
+ perf_counter_task_ctx(ctx, task_event);
rcu_read_unlock();
}
- void perf_counter_fork(struct task_struct *task)
+ static void perf_counter_task(struct task_struct *task,
+ struct perf_counter_context *task_ctx,
+ int new)
{
- struct perf_fork_event fork_event;
+ struct perf_task_event task_event;
if (!atomic_read(&nr_comm_counters) &&
- !atomic_read(&nr_mmap_counters))
+ !atomic_read(&nr_mmap_counters) &&
+ !atomic_read(&nr_task_counters))
return;
- fork_event = (struct perf_fork_event){
- .task = task,
- .event = {
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event = {
.header = {
- .type = PERF_EVENT_FORK,
- .size = sizeof(fork_event.event),
+ .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
+ .misc = 0,
+ .size = sizeof(task_event.event),
},
+ /* .pid */
+ /* .ppid */
+ /* .tid */
+ /* .ptid */
},
};
- perf_counter_fork_event(&fork_event);
+ perf_counter_task_event(&task_event);
+ }
+
+ void perf_counter_fork(struct task_struct *task)
+ {
+ perf_counter_task(task, NULL, 1);
}
/*
struct perf_cpu_context *cpuctx;
struct perf_counter_context *ctx;
unsigned int size;
- char *comm = comm_event->task->comm;
+ char comm[TASK_COMM_LEN];
+ memset(comm, 0, sizeof(comm));
+ strncpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
comm_event = (struct perf_comm_event){
.task = task,
+ /* .comm */
+ /* .comm_size */
.event = {
- .header = { .type = PERF_EVENT_COMM, },
+ .header = {
+ .type = PERF_EVENT_COMM,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
},
};
char *buf = NULL;
const char *name;
+ memset(tmp, 0, sizeof(tmp));
+
if (file) {
- buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ /*
+ * d_path works from the end of the buffer backwards, so we
+ * need to add enough zero bytes after the string to handle
+ * the 64bit alignment we do later.
+ */
+ buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
if (!buf) {
name = strncpy(tmp, "//enomem", sizeof(tmp));
goto got_name;
goto got_name;
}
} else {
- name = arch_vma_name(mmap_event->vma);
- if (name)
+ if (arch_vma_name(mmap_event->vma)) {
+ name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+ sizeof(tmp));
goto got_name;
+ }
if (!vma->vm_mm) {
name = strncpy(tmp, "[vdso]", sizeof(tmp));
mmap_event = (struct perf_mmap_event){
.vma = vma,
+ /* .file_name */
+ /* .file_size */
.event = {
- .header = { .type = PERF_EVENT_MMAP, },
+ .header = {
+ .type = PERF_EVENT_MMAP,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
.start = vma->vm_start,
.len = vma->vm_end - vma->vm_start,
.pgoff = vma->vm_pgoff,
perf_counter_mmap_event(&mmap_event);
}
- /*
- * Log sample_period changes so that analyzing tools can re-normalize the
- * event flow.
- */
-
- struct freq_event {
- struct perf_event_header header;
- u64 time;
- u64 id;
- u64 period;
- };
-
- static void perf_log_period(struct perf_counter *counter, u64 period)
- {
- struct perf_output_handle handle;
- struct freq_event event;
- int ret;
-
- if (counter->hw.sample_period == period)
- return;
-
- if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
- return;
-
- event = (struct freq_event) {
- .header = {
- .type = PERF_EVENT_PERIOD,
- .misc = 0,
- .size = sizeof(event),
- },
- .time = sched_clock(),
- .id = counter->id,
- .period = period,
- };
-
- ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
- if (ret)
- return;
-
- perf_output_put(&handle, event);
- perf_output_end(&handle);
- }
-
/*
* IRQ throttle logging
*/
struct perf_event_header header;
u64 time;
u64 id;
+ u64 stream_id;
} throttle_event = {
.header = {
- .type = PERF_EVENT_THROTTLE + 1,
+ .type = PERF_EVENT_THROTTLE,
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = sched_clock(),
- .id = counter->id,
+ .time = sched_clock(),
+ .id = primary_counter_id(counter),
+ .stream_id = counter->id,
};
+ if (enable)
+ throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
if (ret)
return;
* Generic software counter infrastructure
*/
- static void perf_swcounter_update(struct perf_counter *counter)
+ /*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+ static u64 perf_swcounter_set_period(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
- u64 prev, now;
- s64 delta;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
again:
- prev = atomic64_read(&hwc->prev_count);
- now = atomic64_read(&hwc->count);
- if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
- goto again;
+ old = val = atomic64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
- delta = now - prev;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
- atomic64_add(delta, &counter->count);
- atomic64_sub(delta, &hwc->period_left);
+ return nr;
}
- static void perf_swcounter_set_period(struct perf_counter *counter)
+ static void perf_swcounter_overflow(struct perf_counter *counter,
+ int nmi, struct perf_sample_data *data)
{
struct hw_perf_counter *hwc = &counter->hw;
- s64 left = atomic64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
+ u64 overflow;
- if (unlikely(left <= -period)) {
- left = period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- }
+ data->period = counter->hw.last_period;
+ overflow = perf_swcounter_set_period(counter);
- if (unlikely(left <= 0)) {
- left += period;
- atomic64_add(period, &hwc->period_left);
- hwc->last_period = period;
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
- atomic64_set(&hwc->prev_count, -left);
- atomic64_set(&hwc->count, -left);
+ for (; overflow; overflow--) {
+ if (perf_counter_overflow(counter, nmi, data)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ }
}
- static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
- enum hrtimer_restart ret = HRTIMER_RESTART;
- struct perf_sample_data data;
- struct perf_counter *counter;
- u64 period;
-
- counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
- counter->pmu->read(counter);
-
- data.addr = 0;
- data.regs = get_irq_regs();
/*
- * In case we exclude kernel IPs or are somehow not in interrupt
- * context, provide the next best thing, the user IP.
+ * Nothing to do, we already reset hwc->interrupts.
*/
- if ((counter->attr.exclude_kernel || !data.regs) &&
- !counter->attr.exclude_user)
- data.regs = task_pt_regs(current);
+ }
- if (data.regs) {
- if (perf_counter_overflow(counter, 0, &data))
- ret = HRTIMER_NORESTART;
- }
+ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+ int nmi, struct perf_sample_data *data)
+ {
+ struct hw_perf_counter *hwc = &counter->hw;
- period = max_t(u64, 10000, counter->hw.sample_period);
- hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+ atomic64_add(nr, &counter->count);
- return ret;
- }
+ if (!hwc->sample_period)
+ return;
- static void perf_swcounter_overflow(struct perf_counter *counter,
- int nmi, struct perf_sample_data *data)
- {
- data->period = counter->hw.last_period;
+ if (!data->regs)
+ return;
- perf_swcounter_update(counter);
- perf_swcounter_set_period(counter);
- if (perf_counter_overflow(counter, nmi, data))
- /* soft-disable the counter */
- ;
+ if (!atomic64_add_negative(nr, &hwc->period_left))
+ perf_swcounter_overflow(counter, nmi, data);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
{
- struct perf_counter_context *ctx;
- unsigned long flags;
- int count;
-
+ /*
+ * The counter is active, we're good!
+ */
if (counter->state == PERF_COUNTER_STATE_ACTIVE)
return 1;
+ /*
+ * The counter is off/error, not counting.
+ */
if (counter->state != PERF_COUNTER_STATE_INACTIVE)
return 0;
/*
- * If the counter is inactive, it could be just because
- * its task is scheduled out, or because it's in a group
- * which could not go on the PMU. We want to count in
- * the first case but not the second. If the context is
- * currently active then an inactive software counter must
- * be the second case. If it's not currently active then
- * we need to know whether the counter was active when the
- * context was last active, which we can determine by
- * comparing counter->tstamp_stopped with ctx->time.
- *
- * We are within an RCU read-side critical section,
- * which protects the existence of *ctx.
+ * The counter is inactive, if the context is active
+ * we're part of a group that didn't make it on the 'pmu',
+ * not counting.
*/
- ctx = counter->ctx;
- spin_lock_irqsave(&ctx->lock, flags);
- count = 1;
- /* Re-check state now we have the lock */
- if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
- counter->ctx->is_active ||
- counter->tstamp_stopped < ctx->time)
- count = 0;
- spin_unlock_irqrestore(&ctx->lock, flags);
- return count;
+ if (counter->ctx->is_active)
+ return 0;
+
+ /*
+ * We're inactive and the context is too, this means the
+ * task is scheduled out, we're counting events that happen
+ * to us, like migration events.
+ */
+ return 1;
}
static int perf_swcounter_match(struct perf_counter *counter,
return 1;
}
- static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
- int nmi, struct perf_sample_data *data)
- {
- int neg = atomic64_add_negative(nr, &counter->hw.count);
-
- if (counter->hw.sample_period && !neg && data->regs)
- perf_swcounter_overflow(counter, nmi, data);
- }
-
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
static void perf_swcounter_read(struct perf_counter *counter)
{
- perf_swcounter_update(counter);
}
static int perf_swcounter_enable(struct perf_counter *counter)
{
- perf_swcounter_set_period(counter);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (hwc->sample_period) {
+ hwc->last_period = hwc->sample_period;
+ perf_swcounter_set_period(counter);
+ }
return 0;
}
static void perf_swcounter_disable(struct perf_counter *counter)
{
- perf_swcounter_update(counter);
}
static const struct pmu perf_ops_generic = {
.enable = perf_swcounter_enable,
.disable = perf_swcounter_disable,
.read = perf_swcounter_read,
+ .unthrottle = perf_swcounter_unthrottle,
};
+ /*
+ * hrtimer based swcounter callback
+ */
+
+ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ {
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct perf_counter *counter;
+ u64 period;
+
+ counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+ counter->pmu->read(counter);
+
+ data.addr = 0;
+ data.regs = get_irq_regs();
+ /*
+ * In case we exclude kernel IPs or are somehow not in interrupt
+ * context, provide the next best thing, the user IP.
+ */
+ if ((counter->attr.exclude_kernel || !data.regs) &&
+ !counter->attr.exclude_user)
+ data.regs = task_pt_regs(current);
+
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, counter->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+ }
+
/*
* Software counter: cpu wall time clock
*/
};
#ifdef CONFIG_EVENT_PROFILE
- void perf_tpcounter_event(int event_id)
+ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+ int entry_size)
{
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
struct perf_sample_data data = {
- .regs = get_irq_regs();
- .addr = 0,
+ .regs = get_irq_regs(),
+ .addr = addr,
+ .raw = &raw,
};
if (!data.regs)
data.regs = task_pt_regs(current);
- do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
static void tp_perf_counter_destroy(struct perf_counter *counter)
{
- ftrace_profile_disable(perf_event_id(&counter->attr));
+ ftrace_profile_disable(counter->attr.config);
}
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
- int event_id = perf_event_id(&counter->attr);
- int ret;
+ /*
+ * Raw tracepoint data is a severe data leak, only allow root to
+ * have these.
+ */
+ if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
- ret = ftrace_profile_enable(event_id);
- if (ret)
+ if (ftrace_profile_enable(counter->attr.config))
return NULL;
counter->destroy = tp_perf_counter_destroy;
atomic64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+ * we currently do not support PERF_FORMAT_GROUP on inherited counters
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto done;
switch (attr->type) {
atomic_inc(&nr_mmap_counters);
if (counter->attr.comm)
atomic_inc(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_inc(&nr_task_counters);
}
return counter;
struct perf_counter_context *child_ctx;
unsigned long flags;
- if (likely(!child->perf_counter_ctxp))
+ if (likely(!child->perf_counter_ctxp)) {
+ perf_counter_task(child, NULL, 0);
return;
+ }
local_irq_save(flags);
/*
*/
spin_lock(&child_ctx->lock);
child->perf_counter_ctxp = NULL;
- if (child_ctx->parent_ctx) {
- /*
- * This context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the counters from it.
- */
- put_ctx(child_ctx->parent_ctx);
- child_ctx->parent_ctx = NULL;
- }
- spin_unlock(&child_ctx->lock);
- local_irq_restore(flags);
+ /*
+ * If this context is a clone; unclone it so it can't get
+ * swapped to another process while we're removing all
+ * the counters from it.
+ */
+ unclone_ctx(child_ctx);
+ spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Report the task dead after unscheduling the counters so that we
+ * won't get any samples after PERF_EVENT_EXIT. We can however still
+ * get a few PERF_EVENT_READ events.
+ */
+ perf_counter_task(child, child_ctx, 0);
/*
* We can recurse on the same lock type through:
perf_counter_init_cpu(cpu);
break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ hw_perf_counter_setup_online(cpu);
+ break;
+
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
perf_counter_exit_cpu(cpu);
{
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+ (void *)(long)smp_processor_id());
register_cpu_notifier(&perf_cpu_nb);
}
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
- p->se.sleep_max = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.wait_max = 0;
+ p->se.wait_start = 0;
+ p->se.wait_max = 0;
+ p->se.wait_count = 0;
+ p->se.wait_sum = 0;
+
+ p->se.sleep_start = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+
+ p->se.block_start = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+
#endif
INIT_LIST_HEAD(&p->rt.run_list);
return 0;
}
+ static inline int should_resched(void)
+ {
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+ }
+
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
int __sched _cond_resched(void)
{
- if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
- system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
__cond_resched();
return 1;
}
*/
int cond_resched_lock(spinlock_t *lock)
{
- int resched = need_resched() && system_state == SYSTEM_RUNNING;
+ int resched = should_resched();
int ret = 0;
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched && need_resched())
+ if (resched)
__cond_resched();
else
cpu_relax();
{
BUG_ON(!in_softirq());
- if (need_resched() && system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
local_bh_enable();
__cond_resched();
local_bh_disable();
static void calc_global_load_remove(struct rq *rq)
{
atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
task_rq_unlock(rq, &flags);
get_task_struct(p);
cpu_rq(cpu)->migration_thread = p;
+ rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
- rq->calc_load_update = calc_load_update;
- rq->calc_load_active = 0;
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_clear_events();
seq_ops = inode->i_private;
entry = trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id)
+ if (call->id && call->profile_enable)
entry = trace_create_file("id", 0444, call->dir, call,
id);
#ifdef CONFIG_FUNCTION_TRACER
-static DEFINE_PER_CPU(atomic_t, test_event_disable);
+static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
function_test_events_call(unsigned long ip, unsigned long parent_ip)
pc = preempt_count();
resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+ disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
if (disabled != 1)
goto out;
trace_nowake_buffer_unlock_commit(event, flags, pc);
out:
- atomic_dec(&per_cpu(test_event_disable, cpu));
+ atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
ftrace_preempt_enable(resched);
}
if (pages_written >= write_chunk)
break; /* We've done our duty */
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
}
}
+static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
- p = &__get_cpu_var(ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
* The caller might hold locks which can prevent IO completion
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
else
break;
}
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
else
break; /* All the old data is written */
}
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of nr_cpu_ids units and the first chunk is used
- * for static percpu variables in the kernel image (special boot time
- * alloc/init handling necessary as these areas need to be brought up
- * before allocation services are running). Unit grows as necessary
- * and all units grow or shrink in unison. When a chunk is filled up,
- * another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of boot-time determined number of units and the
+ * first chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated. ie. in
+ * vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
- * percpu base registers pcpu_unit_size apart.
+ * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
+ * cpus. On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
*
- * There are usually many small percpu allocations many of them as
- * small as 4 bytes. The allocator organizes chunks into lists
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be eqaul to or larger than the maximum contiguous
*
* To use this allocator, arch code should do the followings.
*
- * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
- struct page **page; /* points to page array */
- struct page *page_ar[]; /* #cpus * UNIT_PAGES */
+ unsigned long populated[]; /* populated bitmap */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
+/* cpus with the lowest and highest unit numbers */
+static unsigned int pcpu_first_unit_cpu __read_mostly;
+static unsigned int pcpu_last_unit_cpu __read_mostly;
+
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
+/* cpu -> unit map */
+const int *pcpu_unit_map __read_mostly;
+
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* Synchronization rules.
*
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * The latter is a spinlock and protects the index data structures -
- * chunk slots, chunks and area maps in chunks.
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping. The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
- return cpu * pcpu_unit_pages + page_idx;
-}
-
-static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
-{
- return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
-static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
- int page_idx)
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
{
- return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/* set the pointer to a chunk in a page struct */
return (struct pcpu_chunk *)page->index;
}
+static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_zero_bit(chunk->populated, end, *rs);
+ *re = find_next_bit(chunk->populated, end, *rs + 1);
+}
+
+static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+{
+ *rs = find_next_bit(chunk->populated, end, *rs);
+ *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+}
+
+/*
+ * (Un)populated page region iterators. Iterate over (un)populated
+ * page regions betwen @start and @end in @chunk. @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
/**
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
void *first_start = pcpu_first_chunk->vm->addr;
/* is it in the first chunk? */
- if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+ if (addr >= first_start && addr < first_start + pcpu_unit_size) {
/* is it in the reserved area? */
if (addr < first_start + pcpu_reserved_chunk_limit)
return pcpu_reserved_chunk;
return pcpu_first_chunk;
}
+ /*
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
+ */
+ addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
}
/**
- * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest
- * @page_start: page index of the first page to unmap
- * @page_end: page index of the last page to unmap + 1
- * @flush_tlb: whether to flush tlb or not
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
*
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
- * If @flush is true, vcache is flushed before unmapping and tlb
- * after.
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated. Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
*/
-static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
- bool flush_tlb)
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
{
- unsigned int last = nr_cpu_ids - 1;
- unsigned int cpu;
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_alloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_alloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
- /* unmap must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ memset(pages, 0, pages_size);
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
- /*
- * Each flushing trial can be very expensive, issue flush on
- * the whole region at once rather than doing it for each cpu.
- * This could be an overkill but is more scalable.
- */
- flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+ *bitmapp = bitmap;
+ return pages;
+}
- for_each_possible_cpu(cpu)
- unmap_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT);
-
- /* ditto as flush_cache_vunmap() */
- if (flush_tlb)
- flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
}
/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
- bool flush)
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int unmap_start = -1;
- int uninitialized_var(unmap_end);
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep) {
+ pcpu_free_pages(chunk, pages, populated,
+ page_start, page_end);
+ return -ENOMEM;
+ }
+ }
+ }
+ return 0;
+}
- if (!*pagep)
- continue;
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
- __free_page(*pagep);
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
- /*
- * If it's partial depopulation, it might get
- * populated or depopulated again. Mark the
- * page gone.
- */
- *pagep = NULL;
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
- unmap_start = unmap_start < 0 ? i : unmap_start;
- unmap_end = i + 1;
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
}
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
}
- if (unmap_start >= 0)
- pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+ for (i = page_start; i < page_end; i++)
+ __clear_bit(i, populated);
}
/**
- * pcpu_map - map pages into a pcpu_chunk
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
- * For each cpu, map pages [@page_start,@page_end) into @chunk.
- * vcache is flushed afterwards.
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @chunk->populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
*/
-static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, unsigned long *populated,
+ int page_start, int page_end)
{
- unsigned int last = nr_cpu_ids - 1;
- unsigned int cpu;
- int err;
-
- /* map must not be done on immutable chunk */
- WARN_ON(chunk->immutable);
+ unsigned int cpu, tcpu;
+ int i, err;
for_each_possible_cpu(cpu) {
- err = map_kernel_range_noflush(
- pcpu_chunk_addr(chunk, cpu, page_start),
- (page_end - page_start) << PAGE_SHIFT,
- PAGE_KERNEL,
- pcpu_chunk_pagep(chunk, cpu, page_start));
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
if (err < 0)
- return err;
+ goto err;
+ }
+
+ /* mapping successful, link chunk and mark populated */
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ __set_bit(i, populated);
}
- /* flush at once, please read comments in pcpu_unmap() */
- flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
return 0;
+
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ return;
+ break;
+ }
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
- int map_start = -1;
- int uninitialized_var(map_end);
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
unsigned int cpu;
- int i;
+ int rs, re, rc;
- for (i = page_start; i < page_end; i++) {
- if (pcpu_chunk_page_occupied(chunk, i)) {
- if (map_start >= 0) {
- if (pcpu_map(chunk, map_start, map_end))
- goto err;
- map_start = -1;
- }
- continue;
- }
+ /* quick path, check whether all pages are already there */
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
+ if (rs == page_start && re == page_end)
+ goto clear;
+ break;
+ }
- map_start = map_start < 0 ? i : map_start;
- map_end = i + 1;
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
- for_each_possible_cpu(cpu) {
- struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
- *pagep = alloc_pages_node(cpu_to_node(cpu),
- alloc_mask, 0);
- if (!*pagep)
- goto err;
- pcpu_set_page_chunk(*pagep, chunk);
- }
+ /* alloc and map */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
}
- if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
- goto err;
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
for_each_possible_cpu(cpu)
- memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
- size);
-
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
-err:
- /* likely under heavy memory pressure, give memory back */
- pcpu_depopulate_chunk(chunk, off, size, true);
- return -ENOMEM;
+
+err_unmap:
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
- chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
mutex_unlock(&pcpu_alloc_mutex);
+ /* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+ pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
- * @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
- * @reserved_size: the size of reserved percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
- * @base_addr: mapped address, NULL for auto
- * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
+ * @base_addr: mapped address
+ * @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
- * setup path. The first two parameters are mandatory. The rest are
- * optional.
- *
- * @get_page_fn() should return pointer to percpu page given cpu
- * number and page number. It should at least return enough pages to
- * cover the static area. The returned pages for static area should
- * have been initialized with valid data. If @unit_size is specified,
- * it can also return pages after the static area. NULL return
- * indicates end of pages for the cpu. Note that @get_page_fn() must
- * return the same number of pages for all cpus.
+ * setup path.
*
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* non-negative value makes percpu leave alone the area beyond
* @static_size + @reserved_size + @dyn_size.
*
- * @unit_size, if non-negative, specifies unit size and must be
- * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @reserved_size + if non-negative, @dyn_size.
- *
- * Non-null @base_addr means that the caller already allocated virtual
- * region for the first chunk and mapped it. percpu must not mess
- * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
- * @populate_pte_fn doesn't make any sense.
+ * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
+ * equal to or larger than @static_size + @reserved_size + if
+ * non-negative, @dyn_size.
*
- * @populate_pte_fn is used to populate the pagetable. NULL means the
- * caller already populated the pagetable.
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
-size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
- size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size,
- void *base_addr,
- pcpu_populate_pte_fn_t populate_pte_fn)
+size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
+ ssize_t dyn_size, size_t unit_size,
+ void *base_addr, const int *unit_map)
{
static struct vm_struct first_vm;
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0);
struct pcpu_chunk *schunk, *dchunk = NULL;
- unsigned int cpu;
- int nr_pages;
- int err, i;
+ unsigned int cpu, tcpu;
+ int i;
- /* santiy checks */
+ /* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
- if (unit_size >= 0) {
- BUG_ON(unit_size < size_sum);
- BUG_ON(unit_size & ~PAGE_MASK);
- BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
- } else
- BUG_ON(base_addr);
- BUG_ON(base_addr && populate_pte_fn);
-
- if (unit_size >= 0)
- pcpu_unit_pages = unit_size >> PAGE_SHIFT;
- else
- pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
- PFN_UP(size_sum));
+ BUG_ON(!base_addr);
+ BUG_ON(unit_size < size_sum);
+ BUG_ON(unit_size & ~PAGE_MASK);
+ BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+
+ /* determine number of units and verify and initialize pcpu_unit_map */
+ if (unit_map) {
+ int first_unit = INT_MAX, last_unit = INT_MIN;
+
+ for_each_possible_cpu(cpu) {
+ int unit = unit_map[cpu];
+
+ BUG_ON(unit < 0);
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ /* the mapping should be one-to-one */
+ BUG_ON(unit_map[tcpu] == unit);
+ }
+
+ if (unit < first_unit) {
+ pcpu_first_unit_cpu = cpu;
+ first_unit = unit;
+ }
+ if (unit > last_unit) {
+ pcpu_last_unit_cpu = cpu;
+ last_unit = unit;
+ }
+ }
+ pcpu_nr_units = last_unit + 1;
+ pcpu_unit_map = unit_map;
+ } else {
+ int *identity_map;
+
+ /* #units == #cpus, identity mapped */
- identity_map = alloc_bootmem(num_possible_cpus() *
++ identity_map = alloc_bootmem(nr_cpu_ids *
+ sizeof(identity_map[0]));
- pcpu_nr_units = num_possible_cpus();
+ for_each_possible_cpu(cpu)
+ identity_map[cpu] = cpu;
+
+ pcpu_first_unit_cpu = 0;
+ pcpu_last_unit_cpu = pcpu_nr_units - 1;
++ pcpu_nr_units = nr_cpu_ids;
+ pcpu_unit_map = identity_map;
+ }
+
+ /* determine basic parameters */
+ pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
+ pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
+ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
+ first_vm.flags = VM_ALLOC;
+ first_vm.size = pcpu_chunk_size;
+ first_vm.addr = base_addr;
+
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->page = schunk->page_ar;
+ schunk->immutable = true;
+ bitmap_fill(schunk->populated, pcpu_unit_pages);
if (reserved_size) {
schunk->free_size = reserved_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+ dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->page = schunk->page_ar; /* share page map with schunk */
+ dchunk->immutable = true;
+ bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
- /* allocate vm address */
- first_vm.flags = VM_ALLOC;
- first_vm.size = pcpu_chunk_size;
-
- if (!base_addr)
- vm_area_register_early(&first_vm, PAGE_SIZE);
- else {
- /*
- * Pages already mapped. No need to remap into
- * vmalloc area. In this case the first chunks can't
- * be mapped or unmapped by percpu and are marked
- * immutable.
- */
- first_vm.addr = base_addr;
- schunk->immutable = true;
- if (dchunk)
- dchunk->immutable = true;
- }
-
- /* assign pages */
- nr_pages = -1;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < pcpu_unit_pages; i++) {
- struct page *page = get_page_fn(cpu, i);
-
- if (!page)
- break;
- *pcpu_chunk_pagep(schunk, cpu, i) = page;
- }
-
- BUG_ON(i < PFN_UP(static_size));
-
- if (nr_pages < 0)
- nr_pages = i;
- else
- BUG_ON(nr_pages != i);
- }
-
- /* map them */
- if (populate_pte_fn) {
- for_each_possible_cpu(cpu)
- for (i = 0; i < nr_pages; i++)
- populate_pte_fn(pcpu_chunk_addr(schunk,
- cpu, i));
-
- err = pcpu_map(schunk, 0, nr_pages);
- if (err)
- panic("failed to setup static percpu area, err=%d\n",
- err);
- }
-
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
- pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
+ pcpu_base_addr = schunk->vm->addr;
return pcpu_unit_size;
}
-/*
- * Embedding first chunk setup helper.
- */
-static void *pcpue_ptr __initdata;
-static size_t pcpue_size __initdata;
-static size_t pcpue_unit_size __initdata;
-
-static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep)
{
- size_t off = (size_t)pageno << PAGE_SHIFT;
+ size_t size_sum;
- if (off >= pcpue_size)
- return NULL;
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+ if (*dyn_sizep != 0)
+ *dyn_sizep = size_sum - static_size - reserved_size;
- return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
+ return size_sum;
}
/**
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment. Also, when @dyn_size is auto,
- * @dyn_size does not fill the whole first chunk but only what's
- * necessary for page alignment after static and reserved areas.
+ * specified to fill page alignment. When @dyn_size is auto,
+ * @dyn_size is just big enough to fill page alignment after static
+ * and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
- ssize_t dyn_size, ssize_t unit_size)
+ ssize_t dyn_size)
{
- size_t chunk_size;
+ size_t size_sum, unit_size, chunk_size;
+ void *base;
unsigned int cpu;
/* determine parameters and allocate */
- pcpue_size = PFN_ALIGN(static_size + reserved_size +
- (dyn_size >= 0 ? dyn_size : 0));
- if (dyn_size != 0)
- dyn_size = pcpue_size - static_size - reserved_size;
-
- if (unit_size >= 0) {
- BUG_ON(unit_size < pcpue_size);
- pcpue_unit_size = unit_size;
- } else
- pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-
- chunk_size = pcpue_unit_size * nr_cpu_ids;
-
- pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr) {
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+ unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
- chunk_size = unit_size * num_possible_cpus();
++ chunk_size = unit_size * nr_cpu_ids;
+
+ base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
return -ENOMEM;
}
/* return the leftover and copy */
- for_each_possible_cpu(cpu) {
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+ void *ptr = base + cpu * unit_size;
- free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
- memcpy(ptr, __per_cpu_load, static_size);
+ if (cpu_possible(cpu)) {
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
++ free_bootmem(__pa(ptr + size_sum),
++ unit_size - size_sum);
+ memcpy(ptr, __per_cpu_load, static_size);
+ } else
- free_bootmem(__pa(ptr), pcpue_unit_size);
++ free_bootmem(__pa(ptr), unit_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ size_sum >> PAGE_SHIFT, base, static_size);
+
+ return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, base, NULL);
+}
+
+/**
+ * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+ static struct vm_struct vm;
+ int unit_pages;
+ size_t pages_size;
+ struct page **pages;
+ unsigned int cpu;
+ int i, j;
+ ssize_t ret;
+
+ unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+ PCPU_MIN_UNIT_SIZE));
+
+ /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
- sizeof(pages[0]));
++ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
+ pages = alloc_bootmem(pages_size);
+
+ /* allocate pages */
+ j = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < unit_pages; i++) {
+ void *ptr;
+
+ ptr = alloc_fn(cpu, PAGE_SIZE);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate "
+ "4k page for cpu%u\n", cpu);
+ goto enomem;
+ }
+ pages[j++] = virt_to_page(ptr);
+ }
+
+ /* allocate vm area, map the pages and copy static data */
+ vm.flags = VM_ALLOC;
- vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++ vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
+ vm_area_register_early(&vm, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long unit_addr = (unsigned long)vm.addr +
+ (cpu * unit_pages << PAGE_SHIFT);
+
+ for (i = 0; i < unit_pages; i++)
+ populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+ /* pte already populated, the following shouldn't fail */
+ ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+ unit_pages);
+ if (ret < 0)
+ panic("failed to map percpu area, err=%zd\n", ret);
+
+ /*
+ * FIXME: Archs with virtual cache should flush local
+ * cache for the linear mapping here - something
+ * equivalent to flush_cache_vmap() on the local cpu.
+ * flush_cache_vmap() can't be used as most supporting
+ * data structures are not set up yet.
+ */
+
+ /* copy static data */
+ memcpy((void *)unit_addr, __per_cpu_load, static_size);
+ }
+
+ /* we're ready, commit */
+ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
+ unit_pages, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+ unit_pages << PAGE_SHIFT, vm.addr, NULL);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_fn(page_address(pages[j]), PAGE_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pages), pages_size);
+ return ret;
+}
+
+/*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page. The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated. -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep, size_t *unit_sizep,
+ size_t lpage_size, int *unit_map,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ int group_cnt_max = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs;
+ unsigned int cpu, tcpu;
+ int group, unit;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of lpage_size and is the smallest
+ * which can accomodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, lpage_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ group_cnt_max = max(group_cnt_max, group_cnt[group]);
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group_cnt[group]; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 25%. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ *unit_sizep = alloc_size / best_upa;
- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- reserved_size, dyn_size,
- pcpue_unit_size, pcpue_ptr, NULL);
+ /* assign units to cpus accordingly */
+ unit = 0;
+ for (group = 0; group_cnt[group]; group++) {
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ unit_map[cpu] = unit++;
+ unit = roundup(unit, best_upa);
+ }
+
+ return unit; /* unit contains aligned number of units */
+}
+
+struct pcpul_ent {
+ void *ptr;
+ void *map_addr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
+static struct pcpul_ent *pcpul_map;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+ unsigned int *cpup)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (unit_map[cpu] == unit) {
+ if (cpup)
+ *cpup = cpu;
+ return true;
+ }
+
+ return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+ size_t reserved_size, size_t dyn_size,
+ size_t unit_size, size_t lpage_size,
+ const int *unit_map, int nr_units)
+{
+ int width = 1, v = nr_units;
+ char empty_str[] = "--------";
+ int upl, lpl; /* units per lpage, lpage per line */
+ unsigned int cpu;
+ int lpage, unit;
+
+ while (v /= 10)
+ width++;
+ empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+ upl = max_t(int, lpage_size / unit_size, 1);
+ lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+ printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+ static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+ for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+ if (!(unit % upl)) {
+ if (!(lpage++ % lpl)) {
+ printk("\n");
+ printk("%spcpu-lpage: ", lvl);
+ } else
+ printk("| ");
+ }
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ printk("%0*d ", width, cpu);
+ else
+ printk("%s ", empty_str);
+ }
+ printk("\n");
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
+ * @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large page to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size. These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two stage initialization is to allow arch code to evaluate the
+ * parameters before committing to it.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space. Unused holes are returned
+ * to the page allocator. Note that these holes end up being actively
+ * mapped twice - once to the physical mapping and to the vmalloc area
+ * for the first percpu chunk. Depending on architecture, this might
+ * cause problem when changing page attributes of the returned area.
+ * These double mapped areas can be detected using
+ * pcpu_lpage_remapped().
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+ size_t dyn_size, size_t unit_size,
+ size_t lpage_size, const int *unit_map,
+ int nr_units,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_map_fn_t map_fn)
+{
+ static struct vm_struct vm;
+ size_t chunk_size = unit_size * nr_units;
+ size_t map_size;
+ unsigned int cpu;
+ ssize_t ret;
+ int i, j, unit;
+
+ pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+ unit_size, lpage_size, unit_map, nr_units);
+
+ BUG_ON(chunk_size % lpage_size);
+
+ pcpul_size = static_size + reserved_size + dyn_size;
+ pcpul_lpage_size = lpage_size;
+ pcpul_nr_lpages = chunk_size / lpage_size;
+
+ /* allocate pointer array and alloc large pages */
+ map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
+ pcpul_map = alloc_bootmem(map_size);
+
+ /* allocate all pages */
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ size_t offset = i * lpage_size;
+ int first_unit = offset / unit_size;
+ int last_unit = (offset + lpage_size - 1) / unit_size;
+ void *ptr;
+
+ /* find out which cpu is mapped to this unit */
+ for (unit = first_unit; unit <= last_unit; unit++)
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ goto found;
+ continue;
+ found:
+ ptr = alloc_fn(cpu, lpage_size);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate large page "
+ "for cpu%u\n", cpu);
+ goto enomem;
+ }
+
+ pcpul_map[i].ptr = ptr;
+ }
+
+ /* return unused holes */
+ for (unit = 0; unit < nr_units; unit++) {
+ size_t start = unit * unit_size;
+ size_t end = start + unit_size;
+ size_t off, next;
+
+ /* don't free used part of occupied unit */
+ if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+ start += pcpul_size;
+
+ /* unit can span more than one page, punch the holes */
+ for (off = start; off < end; off = next) {
+ void *ptr = pcpul_map[off / lpage_size].ptr;
+ next = min(roundup(off + 1, lpage_size), end);
+ if (ptr)
+ free_fn(ptr + off % lpage_size, next - off);
+ }
+ }
+
+ /* allocate address, map and copy */
+ vm.flags = VM_ALLOC;
+ vm.size = chunk_size;
+ vm_area_register_early(&vm, unit_size);
+
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ if (!pcpul_map[i].ptr)
+ continue;
+ pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+ map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+ }
+
+ for_each_possible_cpu(cpu)
+ memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+ static_size);
+
+ /* we're ready, commit */
+ pr_info("PERCPU: Remapped at %p with large pages, static data "
+ "%zu bytes\n", vm.addr, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, vm.addr, unit_map);
+
+ /*
+ * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
+ * lpages are pushed to the end and trimmed.
+ */
+ for (i = 0; i < pcpul_nr_lpages - 1; i++)
+ for (j = i + 1; j < pcpul_nr_lpages; j++) {
+ struct pcpul_ent tmp;
+
+ if (!pcpul_map[j].ptr)
+ continue;
+ if (pcpul_map[i].ptr &&
+ pcpul_map[i].ptr < pcpul_map[j].ptr)
+ continue;
+
+ tmp = pcpul_map[i];
+ pcpul_map[i] = pcpul_map[j];
+ pcpul_map[j] = tmp;
+ }
+
+ while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+ pcpul_nr_lpages--;
+
+ return ret;
+
+enomem:
+ for (i = 0; i < pcpul_nr_lpages; i++)
+ if (pcpul_map[i].ptr)
+ free_fn(pcpul_map[i].ptr, lpage_size);
+ free_bootmem(__pa(pcpul_map), map_size);
+ return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+ unsigned long lpage_mask = pcpul_lpage_size - 1;
+ void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+ unsigned long offset = (unsigned long)kaddr & lpage_mask;
+ int left = 0, right = pcpul_nr_lpages - 1;
+ int pos;
+
+ /* pcpul in use at all? */
+ if (!pcpul_map)
+ return NULL;
+
+ /* okay, perform binary search */
+ while (left <= right) {
+ pos = (left + right) / 2;
+
+ if (pcpul_map[pos].ptr < lpage_addr)
+ left = pos + 1;
+ else if (pcpul_map[pos].ptr > lpage_addr)
+ right = pos - 1;
+ else
+ return pcpul_map[pos].map_addr + offset;
+ }
+
+ return NULL;
+}
+#endif
+
+/*
+ * Generic percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ size_t static_size = __per_cpu_end - __per_cpu_start;
+ ssize_t unit_size;
+ unsigned long delta;
+ unsigned int cpu;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE);
+ if (unit_size < 0)
+ panic("Failed to initialized percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + cpu * unit_size;
}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
- #include <linux/kmemleak.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
*/
#define NR_KMEM_CACHE_CPU 100
-static DEFINE_PER_CPU(struct kmem_cache_cpu,
- kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
+ kmem_cache_cpu);
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
*/
void kmem_cache_destroy(struct kmem_cache *s)
{
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
down_write(&slub_lock);
s->refcount--;
if (!s->refcount) {
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
+ void *ptr = NULL;
flags |= __GFP_COMP | __GFP_NOTRACK;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
- return page_address(page);
- else
- return NULL;
+ ptr = page_address(page);
+
+ kmemleak_alloc(ptr, size, 1, flags);
+ return ptr;
}
#ifdef CONFIG_NUMA
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
+ kmemleak_free(x);
put_page(page);
return;
}