FB The frame buffer device is enabled.
HW Appropriate hardware is enabled.
IA-64 IA-64 architecture is enabled.
+ IMA Integrity measurement architecture is enabled.
IOSCHED More than one I/O scheduler is enabled.
IP_PNP IP DHCP, BOOTP, or RARP is enabled.
ISAPNP ISA PnP code is enabled.
Default: 64
hpet= [X86-32,HPET] option to control HPET usage
- Format: { enable (default) | disable | force }
+ Format: { enable (default) | disable | force |
+ verbose }
disable: disable HPET and use PIT instead
force: allow force enabled of undocumented chips (ICH4,
VIA, nVidia)
+ verbose: show contents of HPET registers during setup
com20020= [HW,NET] ARCnet - COM20020 chipset
Format:
hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
terminal devices. Valid values: 0..8
+ hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
+ If specified, z/VM IUCV HVC accepts connections
+ from listed z/VM user IDs only.
i8042.debug [HW] Toggle i8042 debug mode
i8042.direct [HW] Put keyboard port into non-translated mode
ihash_entries= [KNL]
Set number of hash buckets for inode cache.
+ ima_audit= [IMA]
+ Format: { "0" | "1" }
+ 0 -- integrity auditing messages. (Default)
+ 1 -- enable informational integrity auditing messages.
+
+ ima_hash= [IMA]
+ Formt: { "sha1" | "md5" }
+ default: "sha1"
+
in2000= [HW,SCSI]
See header of drivers/scsi/in2000.c.
memtest= [KNL,X86] Enable memtest
Format: <integer>
- range: 0,4 : pattern number
default : 0 <disable>
+ Specifies the number of memtest passes to be
+ performed. Each pass selects another test
+ pattern from a given set of patterns. Memtest
+ fills the memory with this pattern, validates
+ memory contents and reserves bad memory
+ regions that are detected.
meye.*= [HW] Set MotionEye Camera parameters
See Documentation/video4linux/meye.txt.
autoconfiguration.
Ranges are in pairs (memory base and size).
- dynamic_printk Enables pr_debug()/dev_dbg() calls if
- CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
- These can also be switched on/off via
- <debugfs>/dynamic_printk/modules
-
print-fatal-signals=
[KNL] debug: print fatal signals
print-fatal-signals=1: print segfault info to
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 29
-EXTRAVERSION = -rc6
-NAME = Erotic Pickled Herring
+EXTRAVERSION =
+NAME = Temporary Tasmanian Devil
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
endif
# Force gcc to behave correct even for buggy distributions
- # Arch Makefiles may override this setting
+ ifndef CONFIG_CC_STACKPROTECTOR
KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
+ endif
ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
# disable pointer signed / unsigned warnings in gcc 4.0
KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
+# disable invalid "can't wrap" optimzations for signed / pointers
+KBUILD_CFLAGS += $(call cc-option,-fwrapv)
+
+# revert to pre-gcc-4.4 behaviour of .eh_frame
+KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm)
+
# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
# But warn user when we do so
warn-assign = \
# and if the SCM is know a tag from the SCM is appended.
# The appended tag is determined by the SCM used.
#
-# Currently, only git is supported.
-# Other SCMs can edit scripts/setlocalversion and add the appropriate
-# checks as needed.
+# .scmversion is used when generating rpm packages so we do not loose
+# the version information from the SCM when we do the build of the kernel
+# from the copied source
ifdef CONFIG_LOCALVERSION_AUTO
- _localver-auto = $(shell $(CONFIG_SHELL) \
- $(srctree)/scripts/setlocalversion $(srctree))
+
+ifeq ($(wildcard .scmversion),)
+ _localver-auto = $(shell $(CONFIG_SHELL) \
+ $(srctree)/scripts/setlocalversion $(srctree))
+else
+ _localver-auto = $(shell cat .scmversion 2> /dev/null)
+endif
+
localver-auto = $(LOCALVERSION)$(_localver-auto)
endif
cmd_depmod = \
if [ -r System.map -a -x $(DEPMOD) ]; then \
$(DEPMOD) -ae -F System.map \
- $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) -r) \
+ $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) ) \
$(KERNELRELEASE); \
fi
cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
last_cpu = cpu;
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+ cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
return 0;
}
seq_printf(p, "%10u ", kstat_irqs(irq));
#else
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[irq]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(irq, j));
#endif
seq_printf(p, " %14s", irq_desc[irq].chip->typename);
seq_printf(p, " %c%s",
seq_printf(p, "%3d: ", i);
for_each_present_cpu(cpu)
- seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu));
seq_printf(p, " %10s", irq_desc[i].chip->name ? : "-");
seq_printf(p, " %s", action->name);
for (action = action->next; action; action = action->next)
.lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
};
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /* We are not allocating bad_irq_desc.affinity or .pending_mask */
+ #error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
+ #endif
+
/*
* do_IRQ handles all hardware IRQ's. Decoded IRQs should not
* come via this function. Instead, they should provide their
irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
#ifdef CONFIG_SMP
- bad_irq_desc.affinity = CPU_MASK_ALL;
+ cpumask_setall(bad_irq_desc.affinity);
bad_irq_desc.cpu = smp_processor_id();
#endif
init_arch_irq();
struct irq_desc *desc = irq_desc + i;
if (desc->cpu == cpu) {
- unsigned int newcpu = any_online_cpu(desc->affinity);
-
- if (newcpu == NR_CPUS) {
+ unsigned int newcpu = cpumask_any_and(desc->affinity,
+ cpu_online_mask);
+ if (newcpu >= nr_cpu_ids) {
if (printk_ratelimit())
printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
i, cpu);
- cpus_setall(desc->affinity);
- newcpu = any_online_cpu(desc->affinity);
+ cpumask_setall(desc->affinity);
+ newcpu = cpumask_any_and(desc->affinity,
+ cpu_online_mask);
}
route_irq(desc, i, newcpu);
#endif
};
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /* We are not allocating a variable-sized bad_irq_desc.affinity */
+ #error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
+ #endif
+
int show_interrupts(struct seq_file *p, void *v)
{
int i = *(loff_t *) v, j;
goto skip;
seq_printf(p, "%3d: ", i);
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
seq_printf(p, " %8s", irq_desc[i].chip->name);
seq_printf(p, " %s", action->name);
for (action = action->next; action; action = action->next)
#endif
generic_handle_irq(irq);
-#ifndef CONFIG_IPIPE /* Useless and bugous over the I-pipe: IRQs are threaded. */
- /* If we're the only interrupt running (ignoring IRQ15 which is for
- syscalls), lower our priority to IRQ14 so that softirqs run at
- that level. If there's another, lower-level interrupt, irq_exit
- will defer softirqs to that. */
+#ifndef CONFIG_IPIPE
+ /*
+ * If we're the only interrupt running (ignoring IRQ15 which
+ * is for syscalls), lower our priority to IRQ14 so that
+ * softirqs run at that level. If there's another,
+ * lower-level interrupt, irq_exit will defer softirqs to
+ * that. If the interrupt pipeline is enabled, we are already
+ * running at IRQ14 priority, so we don't need this code.
+ */
CSYNC();
pending = bfin_read_IPEND() & ~0x8000;
other_ints = pending & (pending - 1);
seq_printf(p, "%10u ", kstat_irqs(i));
#else
for_each_online_cpu(j) {
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
}
#endif
seq_printf(p, " %14s", irq_desc[i].chip->name);
void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
{
if (irq < NR_IRQS) {
- cpumask_copy(&irq_desc[irq].affinity,
+ cpumask_copy(irq_desc[irq].affinity,
cpumask_of(cpu_logical_id(hwid)));
irq_redir[irq] = (char) (redir & 0xff);
}
if (desc->status == IRQ_PER_CPU)
continue;
- if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+ if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
>= nr_cpu_ids) {
/*
* Save it for phase 2 processing
#include <linux/msi.h>
#include <linux/dmar.h>
#include <asm/smp.h>
-
-/*
- * Shifts for APIC-based data
- */
-
-#define MSI_DATA_VECTOR_SHIFT 0
-#define MSI_DATA_VECTOR(v) (((u8)v) << MSI_DATA_VECTOR_SHIFT)
-#define MSI_DATA_VECTOR_MASK 0xffffff00
-
-#define MSI_DATA_DELIVERY_SHIFT 8
-#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_SHIFT)
-#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_SHIFT)
-
-#define MSI_DATA_LEVEL_SHIFT 14
-#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
-#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
-
-#define MSI_DATA_TRIGGER_SHIFT 15
-#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
-#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for APIC-based bus address
- */
-
-#define MSI_TARGET_CPU_SHIFT 4
-#define MSI_ADDR_HEADER 0xfee00000
-
-#define MSI_ADDR_DESTID_MASK 0xfff0000f
-#define MSI_ADDR_DESTID_CPU(cpu) ((cpu) << MSI_TARGET_CPU_SHIFT)
-
-#define MSI_ADDR_DESTMODE_SHIFT 2
-#define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT)
-#define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT)
-
-#define MSI_ADDR_REDIRECTION_SHIFT 3
-#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
-#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+#include <asm/msidef.h>
static struct irq_chip ia64_msi_chip;
read_msi_msg(irq, &msg);
addr = msg.address_lo;
- addr &= MSI_ADDR_DESTID_MASK;
- addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ addr &= MSI_ADDR_DEST_ID_MASK;
+ addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
msg.address_lo = addr;
data = msg.data;
msg.data = data;
write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+ cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
}
#endif /* CONFIG_SMP */
msg.address_hi = 0;
msg.address_lo =
MSI_ADDR_HEADER |
- MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_DEST_MODE_PHYS |
MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DESTID_CPU(dest_phys_id);
+ MSI_ADDR_DEST_ID_CPU(dest_phys_id);
msg.data =
MSI_DATA_TRIGGER_EDGE |
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
- msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
dmar_msi_write(irq, &msg);
- irq_desc[irq].affinity = *mask;
+ cpumask_copy(irq_desc[irq].affinity, mask);
}
#endif /* CONFIG_SMP */
msg->address_hi = 0;
msg->address_lo =
MSI_ADDR_HEADER |
- MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_DEST_MODE_PHYS |
MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DESTID_CPU(dest);
+ MSI_ADDR_DEST_ID_CPU(dest);
msg->data =
MSI_DATA_TRIGGER_EDGE |
}
#ifdef CONFIG_SMP
-int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
+int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
{
int cpu_dest;
if (CHECK_IRQ_PER_CPU(irq)) {
/* Bad linux design decision. The mask has already
* been set; we must reset it */
- cpumask_setall(irq_desc[irq].affinity);
+ cpumask_setall(&irq_desc[irq].affinity);
return -EINVAL;
}
/* whatever mask they set, we just allow one CPU */
cpu_dest = first_cpu(*dest);
- *dest = cpumask_of_cpu(cpu_dest);
- return 0;
+ return cpu_dest;
}
static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
{
- if (cpu_check_affinity(irq, dest))
+ int cpu_dest;
+
+ cpu_dest = cpu_check_affinity(irq, dest);
+ if (cpu_dest < 0)
return;
- cpumask_copy(&irq_desc[irq].affinity, &cpumask_of_cpu(cpu_dest));
- cpumask_copy(irq_desc[irq].affinity, dest);
++ cpumask_copy(&irq_desc[irq].affinity, dest);
}
#endif
seq_printf(p, "%3d: ", i);
#ifdef CONFIG_SMP
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#else
seq_printf(p, "%10u ", kstat_irqs(i));
#endif
unsigned long txn_affinity_addr(unsigned int irq, int cpu)
{
#ifdef CONFIG_SMP
- cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+ cpumask_copy(&irq_desc[irq].affinity, cpumask_of(cpu));
#endif
return per_cpu(cpu_data, cpu).txn_addr;
irq = eirr_to_irq(eirr_val);
#ifdef CONFIG_SMP
- cpumask_copy(&dest, irq_desc[irq].affinity);
+ cpumask_copy(&dest, &irq_desc[irq].affinity);
if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) &&
!cpu_isset(smp_processor_id(), dest)) {
int cpu = first_cpu(dest);
seq_printf(p, "%3d: ", i);
#ifdef CONFIG_SMP
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#else
seq_printf(p, "%10u ", kstat_irqs(i));
#endif /* CONFIG_SMP */
if (irq_desc[irq].status & IRQ_PER_CPU)
continue;
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpumask_and(&mask, irq_desc[irq].affinity, &map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
seq_printf(p, "%10u ", kstat_irqs(i));
#else
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
#endif
seq_printf(p, " %9s", irq_desc[i].chip->typename);
seq_printf(p, " %s", action->name);
#ifdef CONFIG_SMP
static int irq_choose_cpu(unsigned int virt_irq)
{
- cpumask_t mask = irq_desc[virt_irq].affinity;
+ cpumask_t mask;
int cpuid;
+ cpumask_copy(&mask, irq_desc[virt_irq].affinity);
if (cpus_equal(mask, CPU_MASK_ALL)) {
static int irq_rover;
static DEFINE_SPINLOCK(irq_rover_lock);
sun4u_irq_enable(virt_irq);
}
+/* Don't do anything. The desc->status check for IRQ_DISABLED in
+ * handler_irq() will skip the handler call and that will leave the
+ * interrupt in the sent state. The next ->enable() call will hit the
+ * ICLR register to reset the state machine.
+ *
+ * This scheme is necessary, instead of clearing the Valid bit in the
+ * IMAP register, to handle the case of IMAP registers being shared by
+ * multiple INOs (and thus ICLR registers). Since we use a different
+ * virtual IRQ for each shared IMAP instance, the generic code thinks
+ * there is only one user so it prematurely calls ->disable() on
+ * free_irq().
+ *
+ * We have to provide an explicit ->disable() method instead of using
+ * NULL to get the default. The reason is that if the generic code
+ * sees that, it also hooks up a default ->shutdown method which
+ * invokes ->mask() which we do not want. See irq_chip_set_defaults().
+ */
static void sun4u_irq_disable(unsigned int virt_irq)
{
- struct irq_handler_data *data = get_irq_chip_data(virt_irq);
-
- if (likely(data)) {
- unsigned long imap = data->imap;
- unsigned long tmp = upa_readq(imap);
-
- tmp &= ~IMAP_VALID;
- upa_writeq(tmp, imap);
- }
}
static void sun4u_irq_eoi(unsigned int virt_irq)
desc = irq_desc + virt_irq;
- desc->handle_irq(virt_irq, desc);
+ if (!(desc->status & IRQ_DISABLED))
+ desc->handle_irq(virt_irq, desc);
bucket_pa = next_pa;
}
!(irq_desc[irq].status & IRQ_PER_CPU)) {
if (irq_desc[irq].chip->set_affinity)
irq_desc[irq].chip->set_affinity(irq,
- &irq_desc[irq].affinity);
+ irq_desc[irq].affinity);
}
spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
}
#include <linux/clocksource.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
+#include <linux/irq.h>
#include <asm/oplib.h>
#include <asm/timer.h>
-#include <asm/irq.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/starfire.h>
irq_enter();
- kstat_this_cpu.irqs[0]++;
+ kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
if (unlikely(!evt->event_handler)) {
printk(KERN_WARNING
config 64BIT
bool "64-bit kernel" if ARCH = "x86"
default ARCH = "x86_64"
- help
+ ---help---
Say yes to build a 64-bit kernel - formerly known as x86_64
Say no to build a 32-bit kernel - formerly known as i386
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
- select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_KVM
+ select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select USER_STACKTRACE_SUPPORT
+ select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_BZIP2
+ select HAVE_KERNEL_LZMA
config ARCH_DEFCONFIG
string
def_bool y
config HAVE_SETUP_PER_CPU_AREA
- def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
+ def_bool y
+
+ config HAVE_DYNAMIC_PER_CPU_AREA
+ def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
config ARCH_HIBERNATION_POSSIBLE
def_bool y
- depends on !SMP || !X86_VOYAGER
config ARCH_SUSPEND_POSSIBLE
def_bool y
- depends on !X86_VOYAGER
config ZONE_DMA32
bool
bool
default y
+config GENERIC_HARDIRQS_NO__DO_IRQ
+ def_bool y
+
config GENERIC_IRQ_PROBE
bool
default y
depends on GENERIC_HARDIRQS && SMP
default y
- config X86_SMP
- bool
- depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
- default y
-
config USE_GENERIC_SMP_HELPERS
def_bool y
depends on SMP
config X86_HT
bool
depends on SMP
- depends on (X86_32 && !X86_VOYAGER) || X86_64
- default y
-
- config X86_BIOS_REBOOT
- bool
- depends on !X86_VOYAGER
default y
config X86_TRAMPOLINE
bool
- depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
+ depends on SMP || (64BIT && ACPI_SLEEP)
default y
+ config X86_32_LAZY_GS
+ def_bool y
+ depends on X86_32 && !CC_STACKPROTECTOR
+
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
If you don't know what to do here, say N.
- config X86_HAS_BOOT_CPU_ID
- def_bool y
- depends on X86_VOYAGER
+ config X86_X2APIC
+ bool "Support x2apic"
+ depends on X86_LOCAL_APIC && X86_64
+ ---help---
+ This enables x2apic support on CPUs that have this feature.
+
+ This allows 32-bit apic IDs (so it can support very large systems),
+ and accesses the local apic via MSRs not via mmio.
+
+ ( On certain CPU models you may need to enable INTR_REMAP too,
+ to get functional x2apic mode. )
+
+ If you don't know what to do here, say N.
config SPARSE_IRQ
bool "Support sparse irq numbering"
depends on PCI_MSI || HT_IRQ
- help
+ ---help---
This enables support for sparse irqs. This is useful for distro
kernels that want to define a high CONFIG_NR_CPUS value but still
want to have low kernel memory footprint on smaller machines.
bool "Move irq desc when changing irq smp_affinity"
depends on SPARSE_IRQ && NUMA
default n
- help
+ ---help---
This enables moving irq_desc to cpu/node that irq will use handled.
If you don't know what to do here, say N.
- config X86_FIND_SMP_CONFIG
- def_bool y
- depends on X86_MPPARSE || X86_VOYAGER
-
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
depends on X86_LOCAL_APIC
- help
+ ---help---
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
- choice
- prompt "Subarchitecture Type"
- default X86_PC
+ config X86_BIGSMP
+ bool "Support for big SMP systems with more than 8 CPUs"
+ depends on X86_32 && SMP
+ ---help---
+ This option is needed for the systems that have more than 8 CPUs
- config X86_PC
- bool "PC-compatible"
- help
- Choose this option if your computer is a standard PC or compatible.
+ if X86_32
+ config X86_EXTENDED_PLATFORM
+ bool "Support for extended (non-PC) x86 platforms"
+ default y
+ ---help---
+ If you disable this option then the kernel will only support
+ standard PC platforms. (which covers the vast majority of
+ systems out there.)
+
+ If you enable this option then you'll be able to select support
+ for the following (non-PC) 32 bit x86 platforms:
+ AMD Elan
+ NUMAQ (IBM/Sequent)
+ RDC R-321x SoC
+ SGI 320/540 (Visual Workstation)
+ Summit/EXA (IBM x440)
+ Unisys ES7000 IA32 series
+
+ If you have one of these systems, or if you want to build a
+ generic distribution kernel, say Y here - otherwise say N.
+ endif
+
+ if X86_64
+ config X86_EXTENDED_PLATFORM
+ bool "Support for extended (non-PC) x86 platforms"
+ default y
+ ---help---
+ If you disable this option then the kernel will only support
+ standard PC platforms. (which covers the vast majority of
+ systems out there.)
+
+ If you enable this option then you'll be able to select support
+ for the following (non-PC) 64 bit x86 platforms:
+ ScaleMP vSMP
+ SGI Ultraviolet
+
+ If you have one of these systems, or if you want to build a
+ generic distribution kernel, say Y here - otherwise say N.
+ endif
+ # This is an alphabetically sorted list of 64 bit extended platforms
+ # Please maintain the alphabetic order if and when there are additions
+
+ config X86_VSMP
+ bool "ScaleMP vSMP"
+ select PARAVIRT
+ depends on X86_64 && PCI
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
+ supposed to run on these EM64T-based machines. Only choose this option
+ if you have one of these machines.
+
+ config X86_UV
+ bool "SGI Ultraviolet"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ select X86_X2APIC
+ ---help---
+ This option is needed in order to support SGI Ultraviolet systems.
+ If you don't have one of these, you should say N here.
+
+ # Following is an alphabetically sorted list of 32 bit extended platforms
+ # Please maintain the alphabetic order if and when there are additions
config X86_ELAN
bool "AMD Elan"
depends on X86_32
- help
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
Select this for an AMD Elan processor.
Do not use this option for K6/Athlon/Opteron processors!
If unsure, choose "PC-compatible" instead.
- config X86_VOYAGER
- bool "Voyager (NCR)"
- depends on X86_32 && (SMP || BROKEN) && !PCI
- help
- Voyager is an MCA-based 32-way capable SMP architecture proprietary
- to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
-
- *** WARNING ***
-
- If you do not specifically know you have a Voyager based machine,
- say N here, otherwise the kernel you build will not be bootable.
-
- config X86_GENERICARCH
- bool "Generic architecture"
+ config X86_RDC321X
+ bool "RDC R-321x SoC"
depends on X86_32
- help
- This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+ depends on X86_EXTENDED_PLATFORM
+ select M486
+ select X86_REBOOTFIXUPS
+ ---help---
+ This option is needed for RDC R-321x system-on-chip, also known
+ as R-8610-(G).
+ If you don't have one of these chips, you should say N here.
+
+ config X86_32_NON_STANDARD
+ bool "Support non-standard 32-bit SMP architectures"
+ depends on X86_32 && SMP
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
subarchitectures. It is intended for a generic binary kernel.
if you select them all, kernel will probe it one by one. and will
fallback to default.
- if X86_GENERICARCH
+ # Alphabetically sorted list of Non standard 32 bit platforms
config X86_NUMAQ
bool "NUMAQ (IBM/Sequent)"
- depends on SMP && X86_32 && PCI && X86_MPPARSE
+ depends on X86_32_NON_STANDARD
select NUMA
- help
+ select X86_MPPARSE
+ ---help---
This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
NUMA multiquad box. This changes the way that processors are
bootstrapped, and uses Clustered Logical APIC addressing mode instead
of Flat Logical. You will need a new lynxer.elf file to flash your
- config X86_SUMMIT
- bool "Summit/EXA (IBM x440)"
- depends on X86_32 && SMP
- help
- This option is needed for IBM systems that use the Summit/EXA chipset.
- In particular, it is needed for the x440.
-
- config X86_ES7000
- bool "Support for Unisys ES7000 IA32 series"
- depends on X86_32 && SMP
- help
- Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
- supposed to run on an IA32-based Unisys ES7000 system.
-
- config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
- help
- This option is needed for the systems that have more than 8 CPUs
- and if the system is not of any sub-arch type above.
-
- endif
-
- config X86_VSMP
- bool "Support for ScaleMP vSMP"
- select PARAVIRT
- depends on X86_64 && PCI
- help
- Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
- supposed to run on these EM64T-based machines. Only choose this option
- if you have one of these machines.
-
- endchoice
-
config X86_VISWS
bool "SGI 320/540 (Visual Workstation)"
- depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
- help
+ depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
+ depends on X86_32_NON_STANDARD
+ ---help---
The SGI Visual Workstation series is an IA32-based workstation
based on SGI systems chips with some legacy PC hardware attached.
A kernel compiled for the Visual Workstation will run on general
PCs as well. See <file:Documentation/sgi-visws.txt> for details.
- config X86_RDC321X
- bool "RDC R-321x SoC"
- depends on X86_32
- select M486
- select X86_REBOOTFIXUPS
- help
- This option is needed for RDC R-321x system-on-chip, also known
- as R-8610-(G).
- If you don't have one of these chips, you should say N here.
+ config X86_SUMMIT
+ bool "Summit/EXA (IBM x440)"
+ depends on X86_32_NON_STANDARD
+ ---help---
+ This option is needed for IBM systems that use the Summit/EXA chipset.
+ In particular, it is needed for the x440.
+
+ config X86_ES7000
+ bool "Unisys ES7000 IA32 series"
+ depends on X86_32_NON_STANDARD && X86_BIGSMP
+ ---help---
+ Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
+ supposed to run on an IA32-based Unisys ES7000 system.
config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
depends on X86
- help
+ ---help---
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
caller function. This provides more accurate wchan values,
menuconfig PARAVIRT_GUEST
bool "Paravirtualized guest support"
- help
+ ---help---
Say Y here to get to see options related to running Linux under
various hypervisors. This option alone does not add any kernel code.
bool "VMI Guest support"
select PARAVIRT
depends on X86_32
- depends on !X86_VOYAGER
- help
+ ---help---
VMI provides a paravirtualized interface to the VMware ESX server
(it could be used by other hypervisors in theory too, but is not
at the moment), by linking the kernel to a GPL-ed ROM module
bool "KVM paravirtualized clock"
select PARAVIRT
select PARAVIRT_CLOCK
- depends on !X86_VOYAGER
- help
+ ---help---
Turning on this option will allow you to run a paravirtualized clock
when running over the KVM hypervisor. Instead of relying on a PIT
(or probably other) emulation by the underlying device model, the host
config KVM_GUEST
bool "KVM Guest support"
select PARAVIRT
- depends on !X86_VOYAGER
- help
- This option enables various optimizations for running under the KVM
- hypervisor.
+ ---help---
+ This option enables various optimizations for running under the KVM
+ hypervisor.
source "arch/x86/lguest/Kconfig"
config PARAVIRT
bool "Enable paravirtualization code"
- depends on !X86_VOYAGER
- help
+ ---help---
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
over full virtualization. However, when run without a hypervisor
endif
config PARAVIRT_DEBUG
- bool "paravirt-ops debugging"
- depends on PARAVIRT && DEBUG_KERNEL
- help
- Enable to debug paravirt_ops internals. Specifically, BUG if
- a paravirt_op is missing when it is called.
+ bool "paravirt-ops debugging"
+ depends on PARAVIRT && DEBUG_KERNEL
+ ---help---
+ Enable to debug paravirt_ops internals. Specifically, BUG if
+ a paravirt_op is missing when it is called.
config MEMTEST
bool "Memtest"
- help
+ ---help---
This option adds a kernel parameter 'memtest', which allows memtest
to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
+ memtest=0, mean disabled; -- default
+ memtest=1, mean do 1 test pattern;
+ ...
+ memtest=4, mean do 4 test patterns.
If you are unsure how to answer this question, answer N.
config X86_SUMMIT_NUMA
def_bool y
- depends on X86_32 && NUMA && X86_GENERICARCH
+ depends on X86_32 && NUMA && X86_32_NON_STANDARD
config X86_CYCLONE_TIMER
def_bool y
- depends on X86_GENERICARCH
+ depends on X86_32_NON_STANDARD
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
def_bool X86_64
prompt "HPET Timer Support" if X86_32
- help
- Use the IA-PC HPET (High Precision Event Timer) to manage
- time in preference to the PIT and RTC, if a HPET is
- present.
- HPET is the next generation timer replacing legacy 8254s.
- The HPET provides a stable time base on SMP
- systems, unlike the TSC, but it is more expensive to access,
- as it is off-chip. You can find the HPET spec at
- <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
+ ---help---
+ Use the IA-PC HPET (High Precision Event Timer) to manage
+ time in preference to the PIT and RTC, if a HPET is
+ present.
+ HPET is the next generation timer replacing legacy 8254s.
+ The HPET provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. You can find the HPET spec at
+ <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
- You can safely choose Y here. However, HPET will only be
- activated if the platform and the BIOS support this feature.
- Otherwise the 8254 will be used for timing services.
+ You can safely choose Y here. However, HPET will only be
+ activated if the platform and the BIOS support this feature.
+ Otherwise the 8254 will be used for timing services.
- Choose N to continue using the legacy 8254 timer.
+ Choose N to continue using the legacy 8254 timer.
config HPET_EMULATE_RTC
def_bool y
config DMI
default y
bool "Enable DMI scanning" if EMBEDDED
- help
+ ---help---
Enabled scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
affected by entries in the DMI blacklist. Required by PNP
select SWIOTLB
select AGP
depends on X86_64 && PCI
- help
+ ---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
sound, many IDE/SATA chipsets and some other devices.
bool "IBM Calgary IOMMU support"
select SWIOTLB
depends on X86_64 && PCI && EXPERIMENTAL
- help
+ ---help---
Support for hardware IOMMUs in IBM's xSeries x366 and x460
systems. Needed to run systems with more than 3GB of memory
properly with 32-bit PCI devices that do not support DAC
def_bool y
prompt "Should Calgary be enabled by default?"
depends on CALGARY_IOMMU
- help
+ ---help---
Should Calgary be enabled by default? if you choose 'y', Calgary
will be used (if it exists). If you choose 'n', Calgary will not be
used even if it exists. If you choose 'n' and would like to use
select SWIOTLB
select PCI_MSI
depends on X86_64 && PCI && ACPI
- help
+ ---help---
With this option you can enable support for AMD IOMMU hardware in
your system. An IOMMU is a hardware component which provides
remapping of DMA memory accesses from devices. With an AMD IOMMU you
bool "Export AMD IOMMU statistics to debugfs"
depends on AMD_IOMMU
select DEBUG_FS
- help
+ ---help---
This option enables code in the AMD IOMMU driver to collect various
statistics about whats happening in the driver and exports that
information to userspace via debugfs.
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
def_bool y if X86_64
- help
+ ---help---
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
of Intel's x86-64 CPUs). Using this PCI devices which can only
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
select CPUMASK_OFFSTACK
default n
- help
+ ---help---
Configure maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
default "4096" if MAXSMP
default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
default "8" if SMP
- help
+ ---help---
This allows you to specify the maximum number of CPUs which this
kernel will support. The maximum supported value is 512 and the
minimum value which makes sense is 2.
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on X86_HT
- help
+ ---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
def_bool y
prompt "Multi-core scheduler support"
depends on X86_HT
- help
+ ---help---
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
- help
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+ ---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
system which has a processor with a local APIC, you can say Y here to
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
- help
+ ---help---
An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
SMP-capable replacement for PC-style interrupt controllers. Most
SMP systems and many recent uniprocessor systems have one.
config X86_LOCAL_APIC
def_bool y
- depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
config X86_IO_APIC
def_bool y
- depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
config X86_VISWS_APIC
def_bool y
bool "Reroute for broken boot IRQs"
default n
depends on X86_IO_APIC
- help
+ ---help---
This option enables a workaround that fixes a source of
spurious interrupts. This is recommended when threaded
interrupt handling is used on systems where the generation of
config X86_MCE
bool "Machine Check Exception"
- depends on !X86_VOYAGER
---help---
Machine Check Exception support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, component failure).
def_bool y
prompt "Intel MCE features"
depends on X86_64 && X86_MCE && X86_LOCAL_APIC
- help
+ ---help---
Additional support for intel specific MCE features such as
the thermal monitor.
def_bool y
prompt "AMD MCE features"
depends on X86_64 && X86_MCE && X86_LOCAL_APIC
- help
+ ---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
- help
+ ---help---
Enabling this feature starts a timer that triggers every 5 seconds which
will look at the machine check registers to see if anything happened.
Non-fatal problems automatically get corrected (but still logged).
config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt."
depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
- help
+ ---help---
Enabling this feature will cause a message to be printed when the P4
enters thermal throttling.
bool "Enable VM86 support" if EMBEDDED
default y
depends on X86_32
- help
- This option is required by programs like DOSEMU to run 16-bit legacy
+ ---help---
+ This option is required by programs like DOSEMU to run 16-bit legacy
code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ XFree86 to initialize some video cards via BIOS. Disabling this
+ option saves about 6k.
config TOSHIBA
tristate "Toshiba Laptop support"
module will be called microcode.
config MICROCODE_INTEL
- bool "Intel microcode patch loading support"
- depends on MICROCODE
- default MICROCODE
- select FW_LOADER
- --help---
- This options enables microcode patch loading support for Intel
- processors.
-
- For latest news and information on obtaining all the required
- Intel ingredients for this driver, check:
- <http://www.urbanmyth.org/microcode/>.
+ bool "Intel microcode patch loading support"
+ depends on MICROCODE
+ default MICROCODE
+ select FW_LOADER
+ ---help---
+ This options enables microcode patch loading support for Intel
+ processors.
+
+ For latest news and information on obtaining all the required
+ Intel ingredients for this driver, check:
+ <http://www.urbanmyth.org/microcode/>.
config MICROCODE_AMD
- bool "AMD microcode patch loading support"
- depends on MICROCODE
- select FW_LOADER
- --help---
- If you select this option, microcode patch loading support for AMD
- processors will be enabled.
+ bool "AMD microcode patch loading support"
+ depends on MICROCODE
+ select FW_LOADER
+ ---help---
+ If you select this option, microcode patch loading support for AMD
+ processors will be enabled.
- config MICROCODE_OLD_INTERFACE
+ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
- help
+ ---help---
This device gives privileged processes access to the x86
Model-Specific Registers (MSRs). It is a character device with
major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
config X86_CPUID
tristate "/dev/cpu/*/cpuid - CPU information support"
- help
+ ---help---
This device gives processes access to the x86 CPUID instruction to
be executed on a specific processor. It is a character device
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
config HIGHMEM4G
bool "4GB"
depends on !X86_NUMAQ
- help
+ ---help---
Select this if you have a 32-bit processor and between 1 and 4
gigabytes of physical RAM.
bool "64GB"
depends on !M386 && !M486
select X86_PAE
- help
+ ---help---
Select this if you have a 32-bit processor and more than 4
gigabytes of physical RAM.
prompt "Memory split" if EMBEDDED
default VMSPLIT_3G
depends on X86_32
- help
+ ---help---
Select the desired split between kernel and user memory.
If the address range available to the kernel is less than the
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
- help
+ ---help---
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
config ARCH_PHYS_ADDR_T_64BIT
- def_bool X86_64 || X86_PAE
+ def_bool X86_64 || X86_PAE
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
default y
depends on X86_64
- help
+ ---help---
Allow the kernel linear mapping to use 1GB pages on CPUs that
support it. This can improve the kernel's performance a tiny bit by
reducing TLB pressure. If in doubt, say "Y".
bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
- default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
- help
+ ---help---
Enable NUMA (Non Uniform Memory Access) support.
The kernel will try to allocate memory used by a CPU on the
def_bool y
prompt "Old style AMD Opteron NUMA detection"
depends on X86_64 && NUMA && PCI
- help
- Enable K8 NUMA node topology detection. You should say Y here if
- you have a multi processor AMD K8 system. This uses an old
- method to read the NUMA configuration directly from the builtin
- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
- instead, which also takes priority if both are compiled in.
+ ---help---
+ Enable K8 NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD K8 system. This uses an old
+ method to read the NUMA configuration directly from the builtin
+ Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+ instead, which also takes priority if both are compiled in.
config X86_64_ACPI_NUMA
def_bool y
prompt "ACPI NUMA detection"
depends on X86_64 && NUMA && ACPI && PCI
select ACPI_NUMA
- help
+ ---help---
Enable ACPI SRAT based node topology detection.
# Some NUMA nodes have memory ranges that span
config NUMA_EMU
bool "NUMA emulation"
depends on X86_64 && NUMA
- help
+ ---help---
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
default "4" if X86_NUMAQ
default "3"
depends on NEED_MULTIPLE_NODES
- help
+ ---help---
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accomodate various tables.
- config HAVE_ARCH_BOOTMEM_NODE
+ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
- help
+ ---help---
The VM uses one page table entry for each page of physical memory.
For systems with a lot of RAM, this can be wasteful of precious
low memory. Setting this option will put user-space page table
entries in high memory.
config X86_CHECK_BIOS_CORRUPTION
- bool "Check for low memory corruption"
- help
- Periodically check for memory corruption in low memory, which
- is suspected to be caused by BIOS. Even when enabled in the
- configuration, it is disabled at runtime. Enable it by
- setting "memory_corruption_check=1" on the kernel command
- line. By default it scans the low 64k of memory every 60
- seconds; see the memory_corruption_check_size and
- memory_corruption_check_period parameters in
- Documentation/kernel-parameters.txt to adjust this.
-
- When enabled with the default parameters, this option has
- almost no overhead, as it reserves a relatively small amount
- of memory and scans it infrequently. It both detects corruption
- and prevents it from affecting the running system.
-
- It is, however, intended as a diagnostic tool; if repeatable
- BIOS-originated corruption always affects the same memory,
- you can use memmap= to prevent the kernel from using that
- memory.
+ bool "Check for low memory corruption"
+ ---help---
+ Periodically check for memory corruption in low memory, which
+ is suspected to be caused by BIOS. Even when enabled in the
+ configuration, it is disabled at runtime. Enable it by
+ setting "memory_corruption_check=1" on the kernel command
+ line. By default it scans the low 64k of memory every 60
+ seconds; see the memory_corruption_check_size and
+ memory_corruption_check_period parameters in
+ Documentation/kernel-parameters.txt to adjust this.
+
+ When enabled with the default parameters, this option has
+ almost no overhead, as it reserves a relatively small amount
+ of memory and scans it infrequently. It both detects corruption
+ and prevents it from affecting the running system.
+
+ It is, however, intended as a diagnostic tool; if repeatable
+ BIOS-originated corruption always affects the same memory,
+ you can use memmap= to prevent the kernel from using that
+ memory.
config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
- bool "Set the default setting of memory_corruption_check"
+ bool "Set the default setting of memory_corruption_check"
depends on X86_CHECK_BIOS_CORRUPTION
default y
- help
- Set whether the default state of memory_corruption_check is
- on or off.
+ ---help---
+ Set whether the default state of memory_corruption_check is
+ on or off.
config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+ bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
default y
- help
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
+ ---help---
+ Reserve the first 64K of physical RAM on BIOSes that are known
+ to potentially corrupt that memory range. A numbers of BIOSes are
+ known to utilize this area during suspend/resume, so it must not
+ be used by the kernel.
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
+ Set this to N if you are absolutely sure that you trust the BIOS
+ to get all its memory reservations and usages right.
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
+ If you have doubts about the BIOS (e.g. suspend/resume does not
+ work or there's kernel crashes after certain hardware hotplug
+ events) and it's not AMI or Phoenix, then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+ corruption patterns.
- Say Y if unsure.
+ Say Y if unsure.
config MATH_EMULATION
bool
def_bool y
prompt "MTRR cleanup support"
depends on MTRR
- help
+ ---help---
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
range 0 1
default "0"
depends on MTRR_SANITIZER
- help
+ ---help---
Enable mtrr cleanup default value
config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
range 0 7
default "1"
depends on MTRR_SANITIZER
- help
+ ---help---
mtrr cleanup spare entries default, it can be changed via
mtrr_spare_reg_nr=N on the kernel command line.
bool
prompt "x86 PAT support"
depends on MTRR
- help
+ ---help---
Use PAT attributes to setup page level cache control.
PATs are the modern equivalents of MTRRs and are much more
bool "EFI runtime service support"
depends on ACPI
---help---
- This enables the kernel to use EFI runtime services that are
- available (such as the EFI variable services).
+ This enables the kernel to use EFI runtime services that are
+ available (such as the EFI variable services).
- This option is only useful on systems that have EFI firmware.
- In addition, you should use the latest ELILO loader available
- at <http://elilo.sourceforge.net> in order to take advantage
- of EFI runtime services. However, even with this option, the
- resultant kernel should continue to boot on existing non-EFI
- platforms.
+ This option is only useful on systems that have EFI firmware.
+ In addition, you should use the latest ELILO loader available
+ at <http://elilo.sourceforge.net> in order to take advantage
+ of EFI runtime services. However, even with this option, the
+ resultant kernel should continue to boot on existing non-EFI
+ platforms.
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
- help
+ ---help---
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
execution. By using pipes or other transports made available to
If unsure, say Y. Only embedded should say N here.
+ config CC_STACKPROTECTOR_ALL
+ bool
+
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- depends on X86_64 && EXPERIMENTAL && BROKEN
- help
- This option turns on the -fstack-protector GCC feature. This
- feature puts, at the beginning of critical functions, a canary
- value on the stack just before the return address, and validates
+ select CC_STACKPROTECTOR_ALL
+ ---help---
+ This option turns on the -fstack-protector GCC feature. This
+ feature puts, at the beginning of functions, a canary value on
+ the stack just before the return address, and validates
the value just before actually returning. Stack based buffer
overflows (that need to overwrite this return address) now also
overwrite the canary, which gets detected and the attack is then
This feature requires gcc version 4.2 or above, or a distribution
gcc with the feature backported. Older versions are automatically
- detected and for those versions, this configuration option is ignored.
-
- config CC_STACKPROTECTOR_ALL
- bool "Use stack-protector for all functions"
- depends on CC_STACKPROTECTOR
- help
- Normally, GCC only inserts the canary value protection for
- functions that use large-ish on-stack buffers. By enabling
- this option, GCC will be asked to do this for ALL functions.
+ detected and for those versions, this configuration option is
+ ignored. (and a warning is printed during bootup)
source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
- depends on X86_BIOS_REBOOT
- help
+ ---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
but it is independent of the system firmware. And like a reboot
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
- help
+ ---help---
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
which are loaded in the main kernel with kexec-tools into
bool "kexec jump (EXPERIMENTAL)"
depends on EXPERIMENTAL
depends on KEXEC && HIBERNATION && X86_32
- help
+ ---help---
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
default "0x1000000" if X86_NUMAQ
default "0x200000" if X86_64
default "0x100000"
- help
+ ---help---
This gives the physical address where the kernel is loaded.
If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
config RELOCATABLE
bool "Build a relocatable kernel (EXPERIMENTAL)"
depends on EXPERIMENTAL
- help
+ ---help---
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
The relocations tend to make the kernel binary about 10% larger,
default "0x100000" if X86_32
default "0x200000" if X86_64
range 0x2000 0x400000
- help
+ ---help---
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
address which meets above alignment restriction.
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
- depends on SMP && HOTPLUG && !X86_VOYAGER
+ depends on SMP && HOTPLUG
---help---
Say Y here to allow turning CPUs off and on. CPUs can be
controlled through /sys/devices/system/cpu.
def_bool y
prompt "Compat VDSO support"
depends on X86_32 || IA32_EMULATION
- help
+ ---help---
Map the 32-bit VDSO to the predictable old-style address too.
---help---
Say N here if you are running a sufficiently recent glibc
config CMDLINE_BOOL
bool "Built-in kernel command line"
default n
- help
+ ---help---
Allow for specifying boot arguments to the kernel at
build time. On some systems (e.g. embedded ones), it is
necessary or convenient to provide some or all of the
string "Built-in kernel command string"
depends on CMDLINE_BOOL
default ""
- help
+ ---help---
Enter arguments here that should be compiled into the kernel
image and used at boot time. If the boot loader provides a
command line at boot time, it is appended to this string to
bool "Built-in command line overrides boot loader arguments"
default n
depends on CMDLINE_BOOL
- help
+ ---help---
Set this option to 'Y' to have the kernel ignore the boot loader
command line, and use ONLY the built-in command line.
depends on NUMA
menu "Power management and ACPI options"
- depends on !X86_VOYAGER
config ARCH_HIBERNATION_HEADER
def_bool y
config APM_IGNORE_USER_SUSPEND
bool "Ignore USER SUSPEND"
- help
+ ---help---
This option will ignore USER SUSPEND requests. On machines with a
compliant APM BIOS, you want to say N. However, on the NEC Versa M
series notebooks, it is necessary to say Y because of a BIOS bug.
config APM_CPU_IDLE
bool "Make CPU Idle calls when idle"
- help
+ ---help---
Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
On some machines, this can activate improved power savings, such as
a slowed CPU clock rate, when the machine is idle. These idle calls
config APM_DISPLAY_BLANK
bool "Enable console blanking using APM"
- help
+ ---help---
Enable console blanking using the APM. Some laptops can use this to
turn off the LCD backlight when the screen blanker of the Linux
virtual console blanks the screen. Note that this is only used by
config APM_ALLOW_INTS
bool "Allow interrupts during APM BIOS calls"
- help
+ ---help---
Normally we disable external interrupts while we are making calls to
the APM BIOS as a measure to lessen the effects of a badly behaving
BIOS implementation. The BIOS should reenable interrupts if it
bool "PCI support"
default y
select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
- help
+ ---help---
Find out whether you have a PCI motherboard. PCI is the name of a
bus system, i.e. the way the CPU talks to the other stuff inside
your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
- help
+ ---help---
DMA remapping (DMAR) devices support enables independent address
translations for Direct Memory Access (DMA) from devices.
These DMA remapping devices are reported via ACPI tables
def_bool y
prompt "Support for Graphics workaround"
depends on DMAR
- help
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA.
+ ---help---
+ Current Graphics drivers tend to use physical address
+ for DMA and avoid using DMA APIs. Setting this config
+ option permits the IOMMU driver to set a unity map for
+ all the OS-visible memory. Hence the driver can continue
+ to use physical addresses for DMA.
config DMAR_FLOPPY_WA
def_bool y
depends on DMAR
- help
- Floppy disk drivers are know to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16M to make floppy (an ISA device) work.
+ ---help---
+ Floppy disk drivers are know to bypass DMA API calls
+ thereby failing to work when IOMMU is enabled. This
+ workaround will setup a 1:1 mapping for the first
+ 16M to make floppy (an ISA device) work.
config INTR_REMAP
bool "Support for Interrupt Remapping (EXPERIMENTAL)"
depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- help
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
+ select X86_X2APIC
+ ---help---
+ Supports Interrupt remapping for IO-APIC and MSI devices.
+ To use x2apic mode in the CPU's which support x2APIC enhancements or
+ to support platforms with CPU's having > 8 bit APIC ID, say Y.
source "drivers/pci/pcie/Kconfig"
config ISA
bool "ISA support"
- depends on !X86_VOYAGER
- help
+ ---help---
Find out whether you have ISA slots on your motherboard. ISA is the
name of a bus system, i.e. the way the CPU talks to the other stuff
inside your box. Other bus systems are PCI, EISA, MicroChannel
source "drivers/eisa/Kconfig"
config MCA
- bool "MCA support" if !X86_VOYAGER
- default y if X86_VOYAGER
- help
+ bool "MCA support"
+ ---help---
MicroChannel Architecture is found in some IBM PS/2 machines and
laptops. It is a bus system similar to PCI or ISA. See
<file:Documentation/mca.txt> (and especially the web page given
config SCx200
tristate "NatSemi SCx200 support"
- depends on !X86_VOYAGER
- help
+ ---help---
This provides basic support for National Semiconductor's
(now AMD's) Geode processors. The driver probes for the
PCI-IDs of several on-chip devices, so its a good dependency
tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
depends on SCx200 && GENERIC_TIME
default y
- help
+ ---help---
This driver provides a clocksource built upon the on-chip
27MHz high-resolution timer. Its also a workaround for
NSC Geode SC-1100's buggy TSC, which loses time when the
def_bool y
prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
- help
+ ---help---
This driver provides a clock event source based on the MFGPT
timer(s) in the CS5535 and CS5536 companion chip for the geode.
MFGPTs have a better resolution and max interval than the
config OLPC
bool "One Laptop Per Child support"
default n
- help
+ ---help---
Add support for detecting the unique features of the OLPC
XO hardware.
bool "IA32 Emulation"
depends on X86_64
select COMPAT_BINFMT_ELF
- help
+ ---help---
Include code to run 32-bit programs under a 64-bit kernel. You should
likely turn this on, unless you're 100% sure that you don't have any
32-bit programs left.
config IA32_AOUT
- tristate "IA32 a.out support"
- depends on IA32_EMULATION
- help
- Support old a.out binaries in the 32bit emulation.
+ tristate "IA32 a.out support"
+ depends on IA32_EMULATION
+ ---help---
+ Support old a.out binaries in the 32bit emulation.
config COMPAT
def_bool y
+ /*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
-#ifdef CONFIG_EFI
-#include <asm/efi.h>
-#endif
+ #ifndef __ASSEMBLY__
+ #include <linux/kernel.h>
+ #include <asm/acpi.h>
+ #include <asm/apicdef.h>
+ #include <asm/page.h>
+ #ifdef CONFIG_X86_32
+ #include <linux/threads.h>
+ #include <asm/kmap_types.h>
+ #else
+ #include <asm/vsyscall.h>
+ #endif
+
+ /*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+ #ifdef CONFIG_X86_32
+ /* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+ extern unsigned long __FIXADDR_TOP;
+ #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+ #define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+ #define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+ #else
+ #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+
+ /* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+ #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+ #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+ #endif
+
+
+ /*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+ enum fixed_addresses {
#ifdef CONFIG_X86_32
- # include "fixmap_32.h"
+ FIX_HOLE,
+ FIX_VDSO,
#else
- # include "fixmap_64.h"
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_EFI
- FIX_EFI_IO_MAP_LAST_PAGE,
- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
- + MAX_EFI_IO_PAGES - 1,
-#endif
-#endif
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+ #ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+ #endif
+ #ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+ #endif
+ #ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+ #endif
+ #ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+ #endif
+ #ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+ #endif
+ #ifdef CONFIG_X86_32
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ #ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+ #endif
+ #endif
+ #ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+ #endif
+ __end_of_permanent_fixed_addresses,
+ #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+ #endif
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+ #define NR_FIX_BTMAPS 64
+ #define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ #ifdef CONFIG_X86_32
+ FIX_WP_TEST,
+ #endif
+ __end_of_fixed_addresses
+ };
+
+
+ extern void reserve_top_address(unsigned long reserve);
+
+ #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+ #define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+ #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+ #define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
extern int fixmaps_set;
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+ #endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
+ * acpi-cpufreq.c - ACPI Processor P-States Driver
*
#include <linux/ftrace.h>
#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+
#include <acpi/processor.h>
-#include <asm/io.h>
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
-#include <asm/delay.h>
-#include <asm/uaccess.h>
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+ "acpi-cpufreq", msg)
MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
MODULE_DESCRIPTION("ACPI Processor P-States Driver");
perf = data->acpi_data;
- for (i=0; i<perf->state_count; i++) {
+ for (i = 0; i < perf->state_count; i++) {
if (value == perf->states[i].status)
return data->freq_table[i].frequency;
}
msr &= INTEL_MSR_RANGE;
perf = data->acpi_data;
- for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+ for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
if (msr == perf->states[data->freq_table[i].index].status)
return data->freq_table[i].frequency;
}
u8 bit_width;
};
-typedef union {
- struct msr_addr msr;
- struct io_addr io;
-} drv_addr_union;
-
struct drv_cmd {
unsigned int type;
const struct cpumask *mask;
- drv_addr_union addr;
+ union {
+ struct msr_addr msr;
+ struct io_addr io;
+ } addr;
u32 val;
};
unsigned int cur_freq;
unsigned int i;
- for (i=0; i<100; i++) {
+ for (i = 0; i < 100; i++) {
cur_freq = extract_freq(get_cur_val(mask), data);
if (cur_freq == freq)
return 1;
unsigned long freq;
unsigned long freqn = perf->states[0].core_frequency * 1000;
- for (i=0; i<(perf->state_count-1); i++) {
+ for (i = 0; i < (perf->state_count-1); i++) {
freq = freqn;
freqn = perf->states[i+1].core_frequency * 1000;
if ((2 * cpu_khz) > (freqn + freq)) {
if (!data)
return -ENOMEM;
- data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+ data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
per_cpu(drv_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
/* detect transition latency */
policy->cpuinfo.transition_latency = 0;
- for (i=0; i<perf->state_count; i++) {
+ for (i = 0; i < perf->state_count; i++) {
if ((perf->states[i].transition_latency * 1000) >
policy->cpuinfo.transition_latency)
policy->cpuinfo.transition_latency =
data->max_freq = perf->states[0].core_frequency * 1000;
/* table init */
- for (i=0; i<perf->state_count; i++) {
- if (i>0 && perf->states[i].core_frequency >=
+ for (i = 0; i < perf->state_count; i++) {
+ if (i > 0 && perf->states[i].core_frequency >=
data->freq_table[valid_states-1].frequency / 1000)
continue;
#include <linux/cpufreq.h>
#include <linux/ioport.h>
#include <linux/slab.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/delay.h>
#include <asm/msr.h>
#include <asm/tsc.h>
-#include <asm/timex.h>
-#include <asm/io.h>
-#include <asm/delay.h>
#define EPS_BRAND_C7M 0
#define EPS_BRAND_C7 1
break;
}
- switch(brand) {
+ switch (brand) {
case EPS_BRAND_C7M:
printk(KERN_CONT "C7-M\n");
break;
}
/* Enable Enhanced PowerSaver */
rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & 1 << 16)) {
- val |= 1 << 16;
+ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+ val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
wrmsrl(MSR_IA32_MISC_ENABLE, val);
/* Can be locked at 0 */
rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & 1 << 16)) {
+ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
return -ENODEV;
}
/* Print voltage and multiplier */
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
+ printk(KERN_INFO "eps: Current voltage = %dmV\n",
+ current_voltage * 16 + 700);
current_multiplier = (lo >> 8) & 0xff;
printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
/* Print limits */
max_voltage = hi & 0xff;
- printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
+ printk(KERN_INFO "eps: Highest voltage = %dmV\n",
+ max_voltage * 16 + 700);
max_multiplier = (hi >> 8) & 0xff;
printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
min_voltage = (hi >> 16) & 0xff;
- printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
+ printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
+ min_voltage * 16 + 700);
min_multiplier = (hi >> 24) & 0xff;
printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
return 0;
}
-static struct freq_attr* eps_attr[] = {
+static struct freq_attr *eps_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
NULL,
};
cpufreq_unregister_driver(&eps_driver);
}
MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
MODULE_LICENSE("GPL");
#include <linux/string.h>
#include <linux/bitops.h>
#include <linux/smp.h>
+#include <linux/sched.h>
#include <linux/thread_info.h>
#include <linux/module.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
- #include <mach_apic.h>
#endif
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
- * with P/T states and does not stop in deep C-states
+ * with P/T states and does not stop in deep C-states.
+ *
+ * It is also reliable across cores and sockets. (but not across
+ * cabinets - we turn it off in that case explicitly.)
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+ sched_clock_stable = 1;
}
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model < 15)
+ clear_cpu_cap(c, X86_FEATURE_PAT);
}
#ifdef CONFIG_X86_32
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & (1<<9)) == 0) {
+ if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= (1<<9); /* Disable hw prefetching */
+ lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
}
}
SMBIOS_TABLE_GUID)) {
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
+ #ifdef CONFIG_X86_UV
} else if (!efi_guidcmp(config_tables[i].guid,
UV_SYSTEM_TABLE_GUID)) {
efi.uv_systab = config_tables[i].table;
printk(" UVsystab=0x%lx ", config_tables[i].table);
+ #endif
} else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
efi_memory_desc_t *md;
efi_status_t status;
unsigned long size;
- u64 end, systab, addr, npages;
+ u64 end, systab, addr, npages, end_pfn;
void *p, *va;
efi.systab = NULL;
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if (PFN_UP(end) <= max_low_pfn_mapped)
+ end_pfn = PFN_UP(end);
+ if (end_pfn <= max_low_pfn_mapped
+ || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+ && end_pfn <= max_pfn_mapped))
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
#include <asm/proto.h>
#include <asm/efi.h>
#include <asm/cacheflush.h>
+ #include <asm/fixmap.h>
static pgd_t save_pgd __initdata;
static unsigned long efi_flags __initdata;
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
- static unsigned pages_mapped __initdata;
- unsigned i, pages;
- unsigned long offset;
+ unsigned long last_map_pfn;
- pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
-
- if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
return NULL;
- for (i = 0; i < pages; i++) {
- __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
- phys_addr, PAGE_KERNEL);
- phys_addr += PAGE_SIZE;
- pages_mapped++;
- }
-
- return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
- (pages_mapped - pages)) + offset;
+ return (void __iomem *)__va(phys_addr);
}
#include <asm/reboot.h>
#include <asm/pci_x86.h>
#include <asm/virtext.h>
+ #include <asm/cpu.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
# include <asm/iommu.h>
#endif
- #include <mach_ipi.h>
-
/*
* Power off function, if any
*/
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ },
+ },
{ }
};
static void smp_send_nmi_allbutself(void)
{
- send_IPI_allbutself(NMI_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
}
static struct notifier_block crash_nmi_nb = {
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
- #include <asm/arch_hooks.h>
#include <asm/efi.h>
+ #include <asm/timer.h>
+ #include <asm/i8259.h>
#include <asm/sections.h>
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/vmi.h>
- #include <setup_arch.h>
+ #include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/vsyscall.h>
- #include <asm/smp.h>
+ #include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
- #include <mach_apic.h>
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
#define ARCH_SETUP
#endif
+ unsigned int boot_cpu_id __read_mostly;
+
+ #ifdef CONFIG_X86_64
+ int default_cpu_present_to_apicid(int mps_cpu)
+ {
+ return __default_cpu_present_to_apicid(mps_cpu);
+ }
+
+ int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+ {
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+ }
+ #endif
+
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
#else
early_param("elfcorehdr", setup_elfcorehdr);
#endif
- static int __init default_update_genapic(void)
- {
- #ifdef CONFIG_X86_SMP
- # if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
- # endif
- #endif
-
- return 0;
- }
-
- static struct x86_quirks default_x86_quirks __initdata = {
- .update_genapic = default_update_genapic,
- };
+ static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
- pre_setup_arch_hook();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
finish_e820_parsing();
+ if (efi_enabled)
+ efi_init();
+
dmi_scan_machine();
dmi_check_system(bad_bios_dmi_table);
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
- if (efi_enabled)
- efi_init();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
#else
num_physpages = max_pfn;
- if (cpu_has_x2apic)
- check_x2apic();
+ check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
reserve_initrd();
- #ifdef CONFIG_X86_64
vsmp_init();
- #endif
io_delay_init();
*/
acpi_reserve_bootmem();
#endif
- #ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
- #endif
+
reserve_crashkernel();
#ifdef CONFIG_X86_64
map_vsyscall();
#endif
- #ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
- #endif
early_quirks();
#endif
}
+ #ifdef CONFIG_X86_32
+ /**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+ void __init x86_quirk_pre_intr_init(void)
+ {
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
+ init_ISA_irqs();
+ }
+
+ /**
+ * x86_quirk_intr_init - post gate setup interrupt initialisation
+ *
+ * Description:
+ * Fill in any interrupts that may have been left out by the general
+ * init_IRQ() routine. interrupts having to do with the machine rather
+ * than the devices on the I/O bus (like APIC interrupts in intel MP
+ * systems) are started here.
+ **/
+ void __init x86_quirk_intr_init(void)
+ {
+ if (x86_quirks->arch_intr_init) {
+ if (x86_quirks->arch_intr_init())
+ return;
+ }
+ }
+
+ /**
+ * x86_quirk_trap_init - initialise system specific traps
+ *
+ * Description:
+ * Called as the final act of trap_init(). Used in VISWS to initialise
+ * the various board specific APIC traps.
+ **/
+ void __init x86_quirk_trap_init(void)
+ {
+ if (x86_quirks->arch_trap_init) {
+ if (x86_quirks->arch_trap_init())
+ return;
+ }
+ }
+
+ static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .mask = CPU_MASK_NONE,
+ .name = "timer"
+ };
+
+ /**
+ * x86_quirk_pre_time_init - do any specific initialisations before.
+ *
+ **/
+ void __init x86_quirk_pre_time_init(void)
+ {
+ if (x86_quirks->arch_pre_time_init)
+ x86_quirks->arch_pre_time_init();
+ }
+
+ /**
+ * x86_quirk_time_init - do any specific initialisations for the system timer.
+ *
+ * Description:
+ * Must plug the system timer interrupt source at HZ into the IRQ listed
+ * in irq_vectors.h:TIMER_IRQ
+ **/
+ void __init x86_quirk_time_init(void)
+ {
+ if (x86_quirks->arch_time_init) {
+ /*
+ * A nonzero return code does not mean failure, it means
+ * that the architecture quirk does not want any
+ * generic (timer) setup to be performed after this:
+ */
+ if (x86_quirks->arch_time_init())
+ return;
+ }
+
+ irq0.mask = cpumask_of_cpu(0);
+ setup_irq(0, &irq0);
+ }
+ #endif /* CONFIG_X86_32 */
#include <asm/delay.h>
#include <asm/hypervisor.h>
-unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
-unsigned int tsc_khz;
+
+unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
-static int tsc_unstable;
+static int __read_mostly tsc_unstable;
/* native_sched_clock() is called before tsc_init(), so
we must start with the TSC soft disabled to prevent
erroneous rdtsc usage on !cpu_has_tsc processors */
-static int tsc_disabled = -1;
+static int __read_mostly tsc_disabled = -1;
static int tsc_clocksource_reliable;
/*
* use the TSC value at the transitions to calculate a pretty
* good value for the TSC frequencty.
*/
-static inline int pit_expect_msb(unsigned char val)
+static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
- int count = 0;
+ int count;
+ u64 tsc = 0;
for (count = 0; count < 50000; count++) {
/* Ignore LSB */
inb(0x42);
if (inb(0x42) != val)
break;
+ tsc = get_cycles();
}
- return count > 50;
+ *deltap = get_cycles() - tsc;
+ *tscp = tsc;
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count > 5;
}
/*
- * How many MSB values do we want to see? We aim for a
- * 15ms calibration, which assuming a 2us counter read
- * error should give us roughly 150 ppm precision for
- * the calibration.
+ * How many MSB values do we want to see? We aim for
+ * a maximum error rate of 500ppm (in practice the
+ * real error is much smaller), but refuse to spend
+ * more than 25ms on it.
*/
-#define QUICK_PIT_MS 15
-#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
static unsigned long quick_pit_calibrate(void)
{
+ int i;
+ u64 tsc, delta;
+ unsigned long d1, d2;
+
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
outb(0xff, 0x42);
outb(0xff, 0x42);
- if (pit_expect_msb(0xff)) {
- int i;
- u64 t1, t2, delta;
- unsigned char expect = 0xfe;
-
- t1 = get_cycles();
- for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
- if (!pit_expect_msb(expect))
- goto failed;
+ /*
+ * The PIT starts counting at the next edge, so we
+ * need to delay for a microsecond. The easiest way
+ * to do that is to just read back the 16-bit counter
+ * once from the PIT.
+ */
+ inb(0x42);
+ inb(0x42);
+
+ if (pit_expect_msb(0xff, &tsc, &d1)) {
+ for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+ if (!pit_expect_msb(0xff-i, &delta, &d2))
+ break;
+
+ /*
+ * Iterate until the error is less than 500 ppm
+ */
+ delta -= tsc;
+ if (d1+d2 < delta >> 11)
+ goto success;
}
- t2 = get_cycles();
-
- /*
- * Make sure we can rely on the second TSC timestamp:
- */
- if (!pit_expect_msb(expect))
- goto failed;
-
- /*
- * Ok, if we get here, then we've seen the
- * MSB of the PIT decrement QUICK_PIT_ITERATIONS
- * times, and each MSB had many hits, so we never
- * had any sudden jumps.
- *
- * As a result, we can depend on there not being
- * any odd delays anywhere, and the TSC reads are
- * reliable.
- *
- * kHz = ticks / time-in-seconds / 1000;
- * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
- * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
- */
- delta = (t2 - t1)*PIT_TICK_RATE;
- do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
- printk("Fast TSC calibration using PIT\n");
- return delta;
}
-failed:
+ printk("Fast TSC calibration failed\n");
return 0;
+
+success:
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement 'i' times, and the
+ * error has shrunk to less than 500 ppm.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable (within the error). We also adjust the
+ * delta to the middle of the error bars, just
+ * because it looks nicer.
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+ */
+ delta += (long)(d2 - d1)/2;
+ delta *= PIT_TICK_RATE;
+ do_div(delta, i*256*1000);
+ printk("Fast TSC calibration using PIT\n");
+ return delta;
}
/**
return tsc_pit_min;
}
-#ifdef CONFIG_X86_32
-/* Only called from the Powernow K7 cpu freq driver */
int recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
EXPORT_SYMBOL(recalibrate_cpu_khz);
-#endif /* CONFIG_X86_32 */
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
if (!cpu_has_tsc || tsc_unstable)
return 1;
- #ifdef CONFIG_X86_SMP
+ #ifdef CONFIG_SMP
if (apic_is_clustered_box())
return 1;
#endif
{
return lguest_data.irq_enabled;
}
+ PV_CALLEE_SAVE_REGS_THUNK(save_fl);
/* restore_flags() just sets the flags back to the value given. */
static void restore_fl(unsigned long flags)
{
lguest_data.irq_enabled = flags;
}
+ PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
/* Interrupts go off... */
static void irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
+ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
/* Interrupts go on... */
static void irq_enable(void)
{
lguest_data.irq_enabled = X86_EFLAGS_IF;
}
+ PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
/*:*/
/*M:003 Note that we don't check for outstanding interrupts when we re-enable
* them (or when we unmask an interrupt). This seems to work for the moment,
/* There's one problem which normal hardware doesn't have: the Host
* can't handle us removing entries we're currently using. So we clear
* the GS register here: if it's needed it'll be reloaded anyway. */
- loadsegment(gs, 0);
+ lazy_load_gs(0);
lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}
* flush_tlb_user() for both user and kernel mappings unless
* the Page Global Enable (PGE) feature bit is set. */
*dx |= 0x00002000;
+ /* We also lie, and say we're family id 5. 6 or greater
+ * leads to a rdmsr in early_init_intel which we can't handle.
+ * Family ID is returned as bits 8-12 in ax. */
+ *ax &= 0xFFFFF0FF;
+ *ax |= 0x00000500;
break;
case 0x80000000:
/* Futureproof this a little: if they ask how much extended
/* Some systems map "vectors" to interrupts weirdly. Lguest has
* a straightforward 1 to 1 mapping, so force that here. */
__get_cpu_var(vector_irq)[vector] = i;
- if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector,
- interrupt[vector-FIRST_EXTERNAL_VECTOR]);
- set_irq_chip_and_handler_name(i, &lguest_irq_controller,
- handle_level_irq,
- "level");
- }
+ if (vector != SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
}
/* This call is required to set up for 4k stacks, where we have
* separate stacks for hard and soft interrupts. */
irq_ctx_init(smp_processor_id());
}
+void lguest_setup_irq(unsigned int irq)
+{
+ irq_to_desc_alloc_cpu(irq, 0);
+ set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+ handle_level_irq, "level");
+}
+
/*
* Time.
*
return 0;
}
- static struct apic_ops lguest_basic_apic_ops = {
- .read = lguest_apic_read,
- .write = lguest_apic_write,
- .icr_read = lguest_apic_icr_read,
- .icr_write = lguest_apic_icr_write,
- .wait_icr_idle = lguest_apic_wait_icr_idle,
- .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+ static void set_lguest_basic_apic_ops(void)
+ {
+ apic->read = lguest_apic_read;
+ apic->write = lguest_apic_write;
+ apic->icr_read = lguest_apic_icr_read;
+ apic->icr_write = lguest_apic_icr_write;
+ apic->wait_icr_idle = lguest_apic_wait_icr_idle;
+ apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
};
#endif
/* interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
- pv_irq_ops.save_fl = save_fl;
- pv_irq_ops.restore_fl = restore_fl;
- pv_irq_ops.irq_disable = irq_disable;
- pv_irq_ops.irq_enable = irq_enable;
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
/* init-time operations */
#ifdef CONFIG_X86_LOCAL_APIC
/* apic read/write intercepts */
- apic_ops = &lguest_basic_apic_ops;
+ set_lguest_basic_apic_ops();
#endif
/* time operations */
pbase = (pte_t *)page_address(base);
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+ /*
+ * If we ever want to utilize the PAT bit, we need to
+ * update this function to make sure it's converted from
+ * bit 12 to bit 7 when we cross from the 2MB level to
+ * the 4K level:
+ */
+ WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
* primary protection behavior:
*/
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+ /*
+ * Intel Atom errata AAH41 workaround.
+ *
+ * The real fix should be in hw or in a microcode update, but
+ * we also probabilistically try to reduce the window of having
+ * a large TLB mixed with 4K TLBs while instruction fetches are
+ * going on.
+ */
+ __flush_tlb_all();
+
base = NULL;
out_unlock:
}
EXPORT_SYMBOL_GPL(acpi_os_map_memory);
- void acpi_os_unmap_memory(void __iomem * virt, acpi_size size)
+ void __ref acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
{
- if (acpi_gbl_permanent_mmap) {
+ if (acpi_gbl_permanent_mmap)
iounmap(virt);
- }
+ else
+ __acpi_unmap_table(virt, size);
}
EXPORT_SYMBOL_GPL(acpi_os_unmap_memory);
+ void __init early_acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
+ {
+ if (!acpi_gbl_permanent_mmap)
+ __acpi_unmap_table(virt, size);
+ }
+
#ifdef ACPI_FUTURE_USAGE
acpi_status
acpi_os_get_physical_address(void *virt, acpi_physical_address * phys)
return AE_SUPPORT;
}
-#ifdef CONFIG_X86
-
-struct aml_port_desc {
- uint start;
- uint end;
- char* name;
- char warned;
-};
-
-static struct aml_port_desc aml_invalid_port_list[] = {
- {0x20, 0x21, "PIC0", 0},
- {0xA0, 0xA1, "PIC1", 0},
- {0x4D0, 0x4D1, "ELCR", 0}
-};
-
-/*
- * valid_aml_io_address()
- *
- * if valid, return true
- * else invalid, warn once, return false
- */
-static bool valid_aml_io_address(uint address, uint length)
-{
- int i;
- int entries = sizeof(aml_invalid_port_list) / sizeof(struct aml_port_desc);
-
- for (i = 0; i < entries; ++i) {
- if ((address >= aml_invalid_port_list[i].start &&
- address <= aml_invalid_port_list[i].end) ||
- (address + length >= aml_invalid_port_list[i].start &&
- address + length <= aml_invalid_port_list[i].end))
- {
- if (!aml_invalid_port_list[i].warned)
- {
- printk(KERN_ERR "ACPI: Denied BIOS AML access"
- " to invalid port 0x%x+0x%x (%s)\n",
- address, length,
- aml_invalid_port_list[i].name);
- aml_invalid_port_list[i].warned = 1;
- }
- return false; /* invalid */
- }
- }
- return true; /* valid */
-}
-#else
-static inline bool valid_aml_io_address(uint address, uint length) { return true; }
-#endif
/******************************************************************************
*
* FUNCTION: acpi_os_validate_address
switch (space_id) {
case ACPI_ADR_SPACE_SYSTEM_IO:
- if (!valid_aml_io_address(address, length))
- return AE_AML_ILLEGAL_ADDRESS;
case ACPI_ADR_SPACE_SYSTEM_MEMORY:
/* Only interference checks against SystemIO and SytemMemory
are needed */
module_param(phy_flash_cfg, int, 0644);
MODULE_PARM_DESC(phy_flash_cfg, "Set PHYs into reflash mode initially");
+static unsigned irq_adapt_low_thresh = 10000;
+module_param(irq_adapt_low_thresh, uint, 0644);
+MODULE_PARM_DESC(irq_adapt_low_thresh,
+ "Threshold score for reducing IRQ moderation");
+
+static unsigned irq_adapt_high_thresh = 20000;
+module_param(irq_adapt_high_thresh, uint, 0644);
+MODULE_PARM_DESC(irq_adapt_high_thresh,
+ "Threshold score for increasing IRQ moderation");
+
/**************************************************************************
*
* Utility functions and prototypes
channel->rx_pkt = NULL;
}
- efx_flush_lro(channel);
efx_rx_strategy(channel);
efx_fast_push_rx_descriptors(&efx->rx_queue[channel->channel]);
rx_packets = efx_process_channel(channel, budget);
if (rx_packets < budget) {
+ struct efx_nic *efx = channel->efx;
+
+ if (channel->used_flags & EFX_USED_BY_RX &&
+ efx->irq_rx_adaptive &&
+ unlikely(++channel->irq_count == 1000)) {
+ unsigned old_irq_moderation = channel->irq_moderation;
+
+ if (unlikely(channel->irq_mod_score <
+ irq_adapt_low_thresh)) {
+ channel->irq_moderation =
+ max_t(int,
+ channel->irq_moderation -
+ FALCON_IRQ_MOD_RESOLUTION,
+ FALCON_IRQ_MOD_RESOLUTION);
+ } else if (unlikely(channel->irq_mod_score >
+ irq_adapt_high_thresh)) {
+ channel->irq_moderation =
+ min(channel->irq_moderation +
+ FALCON_IRQ_MOD_RESOLUTION,
+ efx->irq_rx_moderation);
+ }
+
+ if (channel->irq_moderation != old_irq_moderation)
+ falcon_set_int_moderation(channel);
+
+ channel->irq_count = 0;
+ channel->irq_mod_score = 0;
+ }
+
/* There is no race here; although napi_disable() will
- * only wait for netif_rx_complete(), this isn't a problem
+ * only wait for napi_complete(), this isn't a problem
* since efx_channel_processed() will have no effect if
* interrupts have already been disabled.
*/
- netif_rx_complete(napi);
+ napi_complete(napi);
efx_channel_processed(channel);
}
}
+static void efx_fini_port(struct efx_nic *efx);
+
/* This call reinitialises the MAC to pick up new PHY settings. The
* caller must hold the mac_lock */
void __efx_reconfigure_port(struct efx_nic *efx)
fail:
EFX_ERR(efx, "failed to reconfigure MAC\n");
- efx->phy_op->fini(efx);
- efx->port_initialized = false;
+ efx->port_enabled = false;
+ efx_fini_port(efx);
}
/* Reinitialise the MAC to pick up new PHY settings, even if the port is
* interrupts across them. */
static int efx_wanted_rx_queues(void)
{
- cpumask_t core_mask;
+ cpumask_var_t core_mask;
int count;
int cpu;
- cpus_clear(core_mask);
+ if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
+ printk(KERN_WARNING
+ "efx.c: allocation failure, irq balancing hobbled\n");
+ return 1;
+ }
+
+ cpumask_clear(core_mask);
count = 0;
for_each_online_cpu(cpu) {
- if (!cpu_isset(cpu, core_mask)) {
+ if (!cpumask_test_cpu(cpu, core_mask)) {
++count;
- cpus_or(core_mask, core_mask,
- topology_core_siblings(cpu));
+ cpumask_or(core_mask, core_mask,
+ topology_core_cpumask(cpu));
}
}
+ free_cpumask_var(core_mask);
return count;
}
efx_set_channels(efx);
/* Initialise the interrupt moderation settings */
- efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec);
+ efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec, true);
return 0;
}
**************************************************************************/
/* Set interrupt moderation parameters */
-void efx_init_irq_moderation(struct efx_nic *efx, int tx_usecs, int rx_usecs)
+void efx_init_irq_moderation(struct efx_nic *efx, int tx_usecs, int rx_usecs,
+ bool rx_adaptive)
{
struct efx_tx_queue *tx_queue;
struct efx_rx_queue *rx_queue;
efx_for_each_tx_queue(tx_queue, efx)
tx_queue->channel->irq_moderation = tx_usecs;
+ efx->irq_rx_adaptive = rx_adaptive;
+ efx->irq_rx_moderation = rx_usecs;
efx_for_each_rx_queue(rx_queue, efx)
rx_queue->channel->irq_moderation = rx_usecs;
}
static int efx_init_napi(struct efx_nic *efx)
{
struct efx_channel *channel;
- int rc;
efx_for_each_channel(channel, efx) {
channel->napi_dev = efx->net_dev;
- rc = efx_lro_init(&channel->lro_mgr, efx);
- if (rc)
- goto err;
}
return 0;
- err:
- efx_fini_napi(efx);
- return rc;
}
static void efx_fini_napi(struct efx_nic *efx)
struct efx_channel *channel;
efx_for_each_channel(channel, efx) {
- efx_lro_fini(&channel->lro_mgr);
channel->napi_dev = NULL;
}
}
rc = efx->phy_op->init(efx);
if (rc)
ok = false;
- } else
+ }
+ if (!ok)
efx->port_initialized = false;
}
static struct efx_board efx_dummy_board_info = {
.init = efx_port_dummy_op_int,
- .init_leds = efx_port_dummy_op_int,
- .set_fault_led = efx_port_dummy_op_blink,
+ .init_leds = efx_port_dummy_op_void,
+ .set_id_led = efx_port_dummy_op_blink,
.monitor = efx_port_dummy_op_int,
.blink = efx_port_dummy_op_blink,
.fini = efx_port_dummy_op_void,
net_dev->features |= (NETIF_F_IP_CSUM | NETIF_F_SG |
NETIF_F_HIGHDMA | NETIF_F_TSO);
if (lro)
- net_dev->features |= NETIF_F_LRO;
+ net_dev->features |= NETIF_F_GRO;
/* Mask for features that also apply to VLAN devices */
net_dev->vlan_features |= (NETIF_F_ALL_CSUM | NETIF_F_SG |
NETIF_F_HIGHDMA | NETIF_F_TSO);
* @next_buffer_table: First available buffer table id
* @pci_dev2: The secondary PCI device if present
* @i2c_data: Operations and state for I2C bit-bashing algorithm
+ * @int_error_count: Number of internal errors seen recently
+ * @int_error_expire: Time at which error count will be expired
*/
struct falcon_nic_data {
unsigned next_buffer_table;
struct pci_dev *pci_dev2;
struct i2c_algo_bit_data i2c_data;
+
+ unsigned int_error_count;
+ unsigned long int_error_expire;
};
/**************************************************************************
#define FALCON_EVQ_SIZE 4096
#define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1)
-/* Max number of internal errors. After this resets will not be performed */
-#define FALCON_MAX_INT_ERRORS 4
+/* If FALCON_MAX_INT_ERRORS internal errors occur within
+ * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and
+ * disable it.
+ */
+#define FALCON_INT_ERROR_EXPIRE 3600
+#define FALCON_MAX_INT_ERRORS 5
/* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times
*/
/* Dummy SRAM size code */
#define SRM_NB_BSZ_ONCHIP_ONLY (-1)
-/* Be nice if these (or equiv.) were in linux/pci_regs.h, but they're not. */
-#define PCI_EXP_DEVCAP_PWR_VAL_LBN 18
-#define PCI_EXP_DEVCAP_PWR_SCL_LBN 26
-#define PCI_EXP_DEVCTL_PAYLOAD_LBN 5
-#define PCI_EXP_LNKSTA_LNK_WID 0x3f0
-#define PCI_EXP_LNKSTA_LNK_WID_LBN 4
-
#define FALCON_IS_DUAL_FUNC(efx) \
(falcon_rev(efx) < FALCON_REV_B0)
nic_data->next_buffer_table += buffer->entries;
EFX_LOG(efx, "allocating special buffers %d-%d at %llx+%x "
- "(virt %p phys %lx)\n", buffer->index,
+ "(virt %p phys %llx)\n", buffer->index,
buffer->index + buffer->entries - 1,
- (unsigned long long)buffer->dma_addr, len,
- buffer->addr, virt_to_phys(buffer->addr));
+ (u64)buffer->dma_addr, len,
+ buffer->addr, (u64)virt_to_phys(buffer->addr));
return 0;
}
return;
EFX_LOG(efx, "deallocating special buffers %d-%d at %llx+%x "
- "(virt %p phys %lx)\n", buffer->index,
+ "(virt %p phys %llx)\n", buffer->index,
buffer->index + buffer->entries - 1,
- (unsigned long long)buffer->dma_addr, buffer->len,
- buffer->addr, virt_to_phys(buffer->addr));
+ (u64)buffer->dma_addr, buffer->len,
+ buffer->addr, (u64)virt_to_phys(buffer->addr));
pci_free_consistent(efx->pci_dev, buffer->len, buffer->addr,
buffer->dma_addr);
tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, TX_EV_DESC_PTR);
tx_ev_q_label = EFX_QWORD_FIELD(*event, TX_EV_Q_LABEL);
tx_queue = &efx->tx_queue[tx_ev_q_label];
+ channel->irq_mod_score +=
+ (tx_ev_desc_ptr - tx_queue->read_count) &
+ efx->type->txd_ring_mask;
efx_xmit_done(tx_queue, tx_ev_desc_ptr);
} else if (EFX_QWORD_FIELD(*event, TX_EV_WQ_FF_FULL)) {
/* Rewrite the FIFO write pointer */
discard = true;
}
+ channel->irq_mod_score += 2;
+
/* Handle received packet */
efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt,
checksummed, discard);
* program is based at 0. So actual interrupt moderation
* achieved is ((x + 1) * res).
*/
- unsigned int res = 5;
- channel->irq_moderation -= (channel->irq_moderation % res);
- if (channel->irq_moderation < res)
- channel->irq_moderation = res;
+ channel->irq_moderation -= (channel->irq_moderation %
+ FALCON_IRQ_MOD_RESOLUTION);
+ if (channel->irq_moderation < FALCON_IRQ_MOD_RESOLUTION)
+ channel->irq_moderation = FALCON_IRQ_MOD_RESOLUTION;
EFX_POPULATE_DWORD_2(timer_cmd,
TIMER_MODE, TIMER_MODE_INT_HLDOFF,
TIMER_VAL,
- (channel->irq_moderation / res) - 1);
+ channel->irq_moderation /
+ FALCON_IRQ_MOD_RESOLUTION - 1);
} else {
EFX_POPULATE_DWORD_2(timer_cmd,
TIMER_MODE, TIMER_MODE_DIS,
struct efx_channel *channel = &efx->channel[0];
struct efx_tx_queue *tx_queue;
struct efx_rx_queue *rx_queue;
- unsigned int read_ptr, i;
+ unsigned int read_ptr = channel->eventq_read_ptr;
+ unsigned int end_ptr = (read_ptr - 1) & FALCON_EVQ_MASK;
- read_ptr = channel->eventq_read_ptr;
- for (i = 0; i < FALCON_EVQ_SIZE; ++i) {
+ do {
efx_qword_t *event = falcon_event(channel, read_ptr);
int ev_code, ev_sub_code, ev_queue;
bool ev_failed;
+
if (!falcon_event_present(event))
break;
ev_code = EFX_QWORD_FIELD(*event, EV_CODE);
- if (ev_code != DRIVER_EV_DECODE)
- continue;
-
ev_sub_code = EFX_QWORD_FIELD(*event, DRIVER_EV_SUB_CODE);
- switch (ev_sub_code) {
- case TX_DESCQ_FLS_DONE_EV_DECODE:
+ if (ev_code == DRIVER_EV_DECODE &&
+ ev_sub_code == TX_DESCQ_FLS_DONE_EV_DECODE) {
ev_queue = EFX_QWORD_FIELD(*event,
DRIVER_EV_TX_DESCQ_ID);
if (ev_queue < EFX_TX_QUEUE_COUNT) {
tx_queue = efx->tx_queue + ev_queue;
tx_queue->flushed = true;
}
- break;
- case RX_DESCQ_FLS_DONE_EV_DECODE:
+ } else if (ev_code == DRIVER_EV_DECODE &&
+ ev_sub_code == RX_DESCQ_FLS_DONE_EV_DECODE) {
ev_queue = EFX_QWORD_FIELD(*event,
DRIVER_EV_RX_DESCQ_ID);
ev_failed = EFX_QWORD_FIELD(*event,
else
rx_queue->flushed = true;
}
- break;
}
read_ptr = (read_ptr + 1) & FALCON_EVQ_MASK;
- }
+ } while (read_ptr != end_ptr);
}
/* Handle tx and rx flushes at the same time, since they run in
efx_oword_t *int_ker = efx->irq_status.addr;
efx_oword_t fatal_intr;
int error, mem_perr;
- static int n_int_errors;
falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER);
error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR);
pci_clear_master(nic_data->pci_dev2);
falcon_disable_interrupts(efx);
- if (++n_int_errors < FALCON_MAX_INT_ERRORS) {
+ /* Count errors and reset or disable the NIC accordingly */
+ if (nic_data->int_error_count == 0 ||
+ time_after(jiffies, nic_data->int_error_expire)) {
+ nic_data->int_error_count = 0;
+ nic_data->int_error_expire =
+ jiffies + FALCON_INT_ERROR_EXPIRE * HZ;
+ }
+ if (++nic_data->int_error_count < FALCON_MAX_INT_ERRORS) {
EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n");
efx_schedule_reset(efx, RESET_TYPE_INT_ERROR);
} else {
{
struct efx_nic *efx = dev_id;
efx_oword_t *int_ker = efx->irq_status.addr;
+ irqreturn_t result = IRQ_NONE;
struct efx_channel *channel;
efx_dword_t reg;
u32 queues;
if (unlikely(syserr))
return falcon_fatal_interrupt(efx);
- if (queues == 0)
- return IRQ_NONE;
-
- efx->last_irq_cpu = raw_smp_processor_id();
- EFX_TRACE(efx, "IRQ %d on CPU %d status " EFX_DWORD_FMT "\n",
- irq, raw_smp_processor_id(), EFX_DWORD_VAL(reg));
-
/* Schedule processing of any interrupting queues */
- channel = &efx->channel[0];
- while (queues) {
- if (queues & 0x01)
+ efx_for_each_channel(channel, efx) {
+ if ((queues & 1) ||
+ falcon_event_present(
+ falcon_event(channel, channel->eventq_read_ptr))) {
efx_schedule_channel(channel);
- channel++;
+ result = IRQ_HANDLED;
+ }
queues >>= 1;
}
- return IRQ_HANDLED;
+ if (result == IRQ_HANDLED) {
+ efx->last_irq_cpu = raw_smp_processor_id();
+ EFX_TRACE(efx, "IRQ %d on CPU %d status " EFX_DWORD_FMT "\n",
+ irq, raw_smp_processor_id(), EFX_DWORD_VAL(reg));
+ }
+
+ return result;
}
efx->phy_op = &falcon_sft9001_phy_ops;
break;
case PHY_TYPE_QT2022C2:
+ case PHY_TYPE_QT2025C:
efx->phy_op = &falcon_xfp_phy_ops;
break;
default:
FALCON_MAC_STATS_SIZE);
if (rc)
return rc;
- EFX_LOG(efx, "stats buffer at %llx (virt %p phys %lx)\n",
- (unsigned long long)efx->stats_buffer.dma_addr,
+ EFX_LOG(efx, "stats buffer at %llx (virt %p phys %llx)\n",
+ (u64)efx->stats_buffer.dma_addr,
efx->stats_buffer.addr,
- virt_to_phys(efx->stats_buffer.addr));
+ (u64)virt_to_phys(efx->stats_buffer.addr));
return 0;
}
goto fail4;
BUG_ON(efx->irq_status.dma_addr & 0x0f);
- EFX_LOG(efx, "INT_KER at %llx (virt %p phys %lx)\n",
- (unsigned long long)efx->irq_status.dma_addr,
- efx->irq_status.addr, virt_to_phys(efx->irq_status.addr));
+ EFX_LOG(efx, "INT_KER at %llx (virt %p phys %llx)\n",
+ (u64)efx->irq_status.dma_addr,
+ efx->irq_status.addr, (u64)virt_to_phys(efx->irq_status.addr));
falcon_probe_spi_devices(efx);
struct falcon_nic_data *nic_data = efx->nic_data;
int rc;
+ /* Remove I2C adapter and clear it in preparation for a retry */
rc = i2c_del_adapter(&efx->i2c_adap);
BUG_ON(rc);
+ memset(&efx->i2c_adap, 0, sizeof(efx->i2c_adap));
falcon_remove_spi_devices(efx);
falcon_free_buffer(efx, &efx->irq_status);
#include <linux/irq.h>
#include <asm/io_apic.h>
#include <asm/smp.h>
+ #include <asm/cpu.h>
#include <linux/intel-iommu.h>
#include "intr_remapping.h"
u8 irte_mask;
};
-#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_GENERIC_HARDIRQS
static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
{
struct irq_2_iommu *iommu;
VMLINUX_SYMBOL(__start___tracepoints) = .; \
*(__tracepoints) \
VMLINUX_SYMBOL(__stop___tracepoints) = .; \
+ /* implement dynamic printk debug */ \
+ . = ALIGN(8); \
+ VMLINUX_SYMBOL(__start___verbose) = .; \
+ *(__verbose) \
+ VMLINUX_SYMBOL(__stop___verbose) = .; \
LIKELY_PROFILE() \
BRANCH_PROFILE()
CPU_DISCARD(init.data) \
CPU_DISCARD(init.rodata) \
MEM_DISCARD(init.data) \
- MEM_DISCARD(init.rodata) \
- /* implement dynamic printk debug */ \
- VMLINUX_SYMBOL(__start___verbose_strings) = .; \
- *(__verbose_strings) \
- VMLINUX_SYMBOL(__stop___verbose_strings) = .; \
- . = ALIGN(8); \
- VMLINUX_SYMBOL(__start___verbose) = .; \
- *(__verbose) \
- VMLINUX_SYMBOL(__stop___verbose) = .;
+ MEM_DISCARD(init.rodata)
#define INIT_TEXT \
*(.init.text) \
*(.initcall7.init) \
*(.initcall7s.init)
+ /**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area. If @vaddr
+ * is not blank, it specifies explicit base address and all percpu
+ * symbols will be offset from the given address. If blank, @vaddr
+ * always equals @laddr + LOAD_OFFSET.
+ *
+ * @phdr defines the output PHDR to use if not blank. Be warned that
+ * output PHDR is sticky. If @phdr is specified, the next output
+ * section in the linker script will go there too. @phdr should have
+ * a leading colon.
+ *
+ * Note that this macros defines __per_cpu_load as an absolute symbol.
+ * If there is no need to put the percpu section at a predetermined
+ * address, use PERCPU().
+ */
+ #define PERCPU_VADDR(vaddr, phdr) \
+ VMLINUX_SYMBOL(__per_cpu_load) = .; \
+ .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
+ - LOAD_OFFSET) { \
+ VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ *(.data.percpu.first) \
+ *(.data.percpu.page_aligned) \
+ *(.data.percpu) \
+ *(.data.percpu.shared_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_end) = .; \
+ } phdr \
+ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
+
+ /**
+ * PERCPU - define output section for percpu area, simple version
+ * @align: required alignment
+ *
+ * Align to @align and outputs output section for percpu area. This
+ * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and
+ * __per_cpu_start will be identical.
+ *
+ * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
+ * that __per_cpu_load is defined as a relative symbol against
+ * .data.percpu which is required for relocatable x86_32
+ * configuration.
+ */
#define PERCPU(align) \
. = ALIGN(align); \
- VMLINUX_SYMBOL(__per_cpu_start) = .; \
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
+ VMLINUX_SYMBOL(__per_cpu_load) = .; \
+ VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ *(.data.percpu.first) \
*(.data.percpu.page_aligned) \
*(.data.percpu) \
*(.data.percpu.shared_aligned) \
- } \
- VMLINUX_SYMBOL(__per_cpu_end) = .;
+ VMLINUX_SYMBOL(__per_cpu_end) = .; \
+ }
typedef irqreturn_t (*irq_handler_t)(int, void *);
+/**
+ * struct irqaction - per interrupt action descriptor
+ * @handler: interrupt handler function
+ * @flags: flags (see IRQF_* above)
+ * @mask: no comment as it is useless and about to be removed
+ * @name: name of the device
+ * @dev_id: cookie to identify the device
+ * @next: pointer to the next irqaction for shared interrupts
+ * @irq: interrupt number
+ * @dir: pointer to the proc/irq/NN/name entry
+ */
struct irqaction {
irq_handler_t handler;
unsigned long flags;
}
#endif
+#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
+extern void debug_poll_all_shared_irqs(void);
+#else
+static inline void debug_poll_all_shared_irqs(void) { }
+#endif
+
int show_interrupts(struct seq_file *p, void *v);
struct irq_desc;
extern int early_irq_init(void);
+ extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);
extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
*/
struct irq_desc {
unsigned int irq;
-#ifdef CONFIG_SPARSE_IRQ
struct timer_rand_state *timer_rand_state;
unsigned int *kstat_irqs;
-# ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_INTR_REMAP
struct irq_2_iommu *irq_2_iommu;
-# endif
#endif
irq_flow_handler_t handle_irq;
struct irq_chip *chip;
unsigned int irqs_unhandled;
spinlock_t lock;
#ifdef CONFIG_SMP
- cpumask_t affinity;
+ cpumask_var_t affinity;
unsigned int cpu;
- #endif
#ifdef CONFIG_GENERIC_PENDING_IRQ
- cpumask_t pending_mask;
+ cpumask_var_t pending_mask;
+ #endif
#endif
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *dir;
extern struct irq_desc irq_desc[NR_IRQS];
#else /* CONFIG_SPARSE_IRQ */
extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
-
-#define kstat_irqs_this_cpu(DESC) \
- ((DESC)->kstat_irqs[smp_processor_id()])
-#define kstat_incr_irqs_this_cpu(irqno, DESC) \
- ((DESC)->kstat_irqs[smp_processor_id()]++)
-
#endif /* CONFIG_SPARSE_IRQ */
extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
* Migration helpers for obsolete names, they will go away:
*/
#define hw_interrupt_type irq_chip
-typedef struct irq_chip hw_irq_controller;
#define no_irq_type no_irq_chip
typedef struct irq_desc irq_desc_t;
#include <asm/hw_irq.h>
extern int setup_irq(unsigned int irq, struct irqaction *new);
+extern void remove_irq(unsigned int irq, struct irqaction *act);
#ifdef CONFIG_GENERIC_HARDIRQS
}
/* Handle irq action chains: */
-extern int handle_IRQ_event(unsigned int irq, struct irqaction *action);
+extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action);
/*
* Built-in IRQ handlers for various IRQ types,
/* Handling of unhandled and spurious interrupts: */
extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
- int action_ret);
+ irqreturn_t action_ret);
/* Resending of interrupts :*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
#endif /* !CONFIG_S390 */
+ #ifdef CONFIG_SMP
+ /**
+ * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * @desc: pointer to irq_desc struct
+ * @cpu: cpu which will be handling the cpumasks
+ * @boot: true if need bootmem
+ *
+ * Allocates affinity and pending_mask cpumask if required.
+ * Returns true if successful (or not required).
+ * Side effect: affinity has all bits set, pending_mask has all bits clear.
+ */
+ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+ bool boot)
+ {
+ int node;
+
+ if (boot) {
+ alloc_bootmem_cpumask_var(&desc->affinity);
+ cpumask_setall(desc->affinity);
+
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+ alloc_bootmem_cpumask_var(&desc->pending_mask);
+ cpumask_clear(desc->pending_mask);
+ #endif
+ return true;
+ }
+
+ node = cpu_to_node(cpu);
+
+ if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+ return false;
+ cpumask_setall(desc->affinity);
+
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+ free_cpumask_var(desc->affinity);
+ return false;
+ }
+ cpumask_clear(desc->pending_mask);
+ #endif
+ return true;
+ }
+
+ /**
+ * init_copy_desc_masks - copy cpumasks for irq_desc
+ * @old_desc: pointer to old irq_desc struct
+ * @new_desc: pointer to new irq_desc struct
+ *
+ * Insures affinity and pending_masks are copied to new irq_desc.
+ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+ * irq_desc struct so the copy is redundant.
+ */
+
+ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+ struct irq_desc *new_desc)
+ {
+ #ifdef CONFIG_CPUMASKS_OFFSTACK
+ cpumask_copy(new_desc->affinity, old_desc->affinity);
+
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+ #endif
+ #endif
+ }
+
+ #else /* !CONFIG_SMP */
+
+ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+ bool boot)
+ {
+ return true;
+ }
+
+ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+ struct irq_desc *new_desc)
+ {
+ }
+
+ #endif /* CONFIG_SMP */
+
#endif /* _LINUX_IRQ_H */
# define for_each_irq_desc_reverse(irq, desc) \
for (irq = nr_irqs - 1; irq >= 0; irq--)
+
#else /* CONFIG_GENERIC_HARDIRQS */
extern int nr_irqs;
# define for_each_irq_desc(irq, desc) \
for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \
irq++, desc = irq_to_desc(irq)) \
- if (desc)
+ if (!desc) \
+ ; \
+ else
# define for_each_irq_desc_reverse(irq, desc) \
for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; \
irq--, desc = irq_to_desc(irq)) \
- if (desc)
+ if (!desc) \
+ ; \
+ else
#endif /* CONFIG_GENERIC_HARDIRQS */
struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
u64 last_wakeup;
u64 avg_overlap;
+ u64 start_runtime;
+ u64 avg_wakeup;
+ u64 nr_migrations;
+
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
u64 exec_max;
u64 slice_max;
- u64 nr_migrations;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
#endif
struct list_head tasks;
+ struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm;
/* ??? */
unsigned int personality;
unsigned did_exec:1;
+ unsigned in_execve:1; /* Tell the LSMs that the process is doing an
+ * execve */
pid_t pid;
pid_t tgid;
- #ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
- #endif
+
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
#endif
};
+/* Future-safe accessor for struct task_struct's cpus_allowed. */
+#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
return set_cpus_allowed_ptr(p, &new_mask);
}
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+extern int sched_clock_stable;
+#endif
+
extern unsigned long long sched_clock(void);
extern void sched_clock_init(void);
extern void thread_info_cache_init(void);
+ #ifdef CONFIG_DEBUG_STACK_USAGE
+ static inline unsigned long stack_not_used(struct task_struct *p)
+ {
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n++;
+ } while (!*n);
+
+ return (unsigned long)n - (unsigned long)end_of_stack(p);
+ }
+ #endif
+
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*/
extern int sched_group_set_rt_period(struct task_group *tg,
long rt_period_us);
extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
#endif
#endif
+extern int task_can_switch_user(struct user_struct *up,
+ struct task_struct *tsk);
+
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
which is done within the script "scripts/setlocalversion".)
+ config HAVE_KERNEL_GZIP
+ bool
+
+ config HAVE_KERNEL_BZIP2
+ bool
+
+ config HAVE_KERNEL_LZMA
+ bool
+
+ choice
+ prompt "Kernel compression mode"
+ default KERNEL_GZIP
+ depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
+ help
+ The linux kernel is a kind of self-extracting executable.
+ Several compression algorithms are available, which differ
+ in efficiency, compression and decompression speed.
+ Compression speed is only relevant when building a kernel.
+ Decompression speed is relevant at each boot.
+
+ If you have any problems with bzip2 or lzma compressed
+ version of this functionality (bzip2 only), for 2.4, was
+ supplied by Christian Ludwig)
+
+ High compression options are mostly useful for users, who
+ are low on disk space (embedded systems), but for whom ram
+ size matters less.
+
+ If in doubt, select 'gzip'
+
+ config KERNEL_GZIP
+ bool "Gzip"
+ depends on HAVE_KERNEL_GZIP
+ help
+ The old and tried gzip compression. Its compression ratio is
+ the poorest among the 3 choices; however its speed (both
+ compression and decompression) is the fastest.
+
+ config KERNEL_BZIP2
+ bool "Bzip2"
+ depends on HAVE_KERNEL_BZIP2
+ help
+ Its compression ratio and speed is intermediate.
+ Decompression speed is slowest among the three. The kernel
+ size is about 10% smaller with bzip2, in comparison to gzip.
+ Bzip2 uses a large amount of memory. For modern kernels you
+ will need at least 8MB RAM or more for booting.
+
+ config KERNEL_LZMA
+ bool "LZMA"
+ depends on HAVE_KERNEL_LZMA
+ help
+ The most recent compression algorithm.
+ Its ratio is best, decompression speed is between the other
+ two. Compression is slowest. The kernel size is about 33%
+ smaller with LZMA in comparison to gzip.
+
+ endchoice
+
config SWAP
bool "Support for paging of anonymous memory (swap)"
depends on MMU && BLOCK
config SYSCTL
bool
+config ANON_INODES
+ bool
+
menuconfig EMBEDDED
bool "Configure standard kernel features (for small systems)"
help
This option allows to disable the internal PC-Speaker
support, saving some memory.
-config COMPAT_BRK
- bool "Disable heap randomization"
- default y
- help
- Randomizing heap placement makes heap exploits harder, but it
- also breaks ancient binaries (including anything libc5 based).
- This option changes the bootup default to heap randomization
- disabled, and can be overriden runtime by setting
- /proc/sys/kernel/randomize_va_space to 2.
-
- On non-ancient distros (post-2000 ones) N is usually a safe choice.
-
config BASE_FULL
default y
bool "Enable full-sized data structures for core" if EMBEDDED
support for "fast userspace mutexes". The resulting kernel may not
run glibc-based applications correctly.
-config ANON_INODES
- bool
-
config EPOLL
bool "Enable eventpoll support" if EMBEDDED
default y
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
+config COMPAT_BRK
+ bool "Disable heap randomization"
+ default y
+ help
+ Randomizing heap placement makes heap exploits harder, but it
+ also breaks ancient binaries (including anything libc5 based).
+ This option changes the bootup default to heap randomization
+ disabled, and can be overriden runtime by setting
+ /proc/sys/kernel/randomize_va_space to 2.
+
+ On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
choice
prompt "Choose SLAB allocator"
default SLUB
config RT_MUTEXES
boolean
- select PLIST
config BASE_SMALL
int
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
+ #include <linux/stackprotector.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/delay.h>
extern void tc_init(void);
#endif
-enum system_states system_state;
+enum system_states system_state __read_mostly;
EXPORT_SYMBOL(system_state);
/*
* greater than 0, limits the maximum number of CPUs activated in
* SMP mode to <NUM>.
*/
- #ifndef CONFIG_X86_IO_APIC
- static inline void disable_ioapic_setup(void) {};
- #endif
+
+ void __weak arch_disable_smp_support(void) { }
static int __init nosmp(char *str)
{
setup_max_cpus = 0;
- disable_ioapic_setup();
+ arch_disable_smp_support();
+
return 0;
}
{
get_option(&str, &setup_max_cpus);
if (setup_max_cpus == 0)
- disable_ioapic_setup();
+ arch_disable_smp_support();
return 0;
}
early_param("maxcpus", maxcpus);
#else
- #define setup_max_cpus NR_CPUS
+ const unsigned int setup_max_cpus = NR_CPUS;
#endif
/*
* at least once to get things moving:
*/
init_idle_bootup_task(current);
+ rcu_scheduler_starting();
preempt_enable_no_resched();
schedule();
preempt_disable();
*/
lockdep_init();
debug_objects_early_init();
+
+ /*
+ * Set up the the initial canary ASAP:
+ */
+ boot_init_stack_canary();
+
cgroup_init_early();
local_irq_disable();
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <trace/sched.h>
+ #include <linux/magic.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
{
struct task_struct *tsk;
struct thread_info *ti;
+ unsigned long *stackend;
+
int err;
prepare_to_copy(orig);
goto out;
setup_thread_stack(tsk, orig);
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
#endif
clear_all_latency_tracing(p);
- /* Our parent execution domain becomes current domain
- These must match for thread signalling to apply */
- p->parent_exec_id = p->self_exec_id;
-
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
set_task_cpu(p, smp_processor_id());
/* CLONE_PARENT re-uses the old parent */
- if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
- else
+ p->parent_exec_id = current->parent_exec_id;
+ } else {
p->real_parent = current;
+ p->parent_exec_id = current->self_exec_id;
+ }
spin_lock(¤t->sighand->siglock);
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
- cpumask_setall(&desc->affinity);
+ cpumask_setall(desc->affinity);
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_clear(desc->pending_mask);
+ #endif
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
desc->name = NULL;
+ clear_kstat_irqs(desc);
spin_unlock_irqrestore(&desc->lock, flags);
}
desc->chip->mask_ack(irq);
else {
desc->chip->mask(irq);
- desc->chip->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
}
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
- desc->chip->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
desc = irq_remap_to_desc(irq, desc);
/* Mark the IRQ currently in progress.*/
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
+ #include <linux/bootmem.h>
#include "internals.h"
EXPORT_SYMBOL_GPL(nr_irqs);
#ifdef CONFIG_SPARSE_IRQ
+
static struct irq_desc irq_desc_init = {
.irq = -1,
.status = IRQ_DISABLED,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
- #ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
- #endif
};
void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
{
- unsigned long bytes;
- char *ptr;
int node;
-
- /* Compute how many bytes we need per irq and allocate them */
- bytes = nr * sizeof(unsigned int);
+ void *ptr;
node = cpu_to_node(cpu);
- ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
- printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+ ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
- if (ptr)
- desc->kstat_irqs = (unsigned int *)ptr;
+ /*
+ * don't overwite if can not get new one
+ * init_copy_kstat_irqs() could still use old one
+ */
+ if (ptr) {
+ printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n",
+ cpu, node);
+ desc->kstat_irqs = ptr;
+ }
}
static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
printk(KERN_ERR "can not alloc kstat_irqs\n");
BUG_ON(1);
}
+ if (!init_alloc_desc_masks(desc, cpu, false)) {
+ printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+ BUG_ON(1);
+ }
arch_init_chip_data(desc, cpu);
}
*/
DEFINE_SPINLOCK(sparse_irq_lock);
- struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+ struct irq_desc **irq_desc_ptrs __read_mostly;
static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS_LEGACY-1] = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
- #ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
- #endif
}
};
- /* FIXME: use bootmem alloc ...*/
- static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+ static unsigned int *kstat_irqs_legacy;
int __init early_irq_init(void)
{
init_irq_default_affinity();
+ /* initialize nr_irqs based on nr_cpu_ids */
+ arch_probe_nr_irqs();
+ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ /* allocate irq_desc_ptrs array based on nr_irqs */
+ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+
+ /* allocate based on nr_cpu_ids */
+ /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
+ kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int));
+
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
- desc[i].kstat_irqs = kstat_irqs_legacy[i];
+ desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+ init_alloc_desc_masks(&desc[i], 0, true);
irq_desc_ptrs[i] = desc + i;
}
- for (i = legacy_count; i < NR_IRQS; i++)
+ for (i = legacy_count; i < nr_irqs; i++)
irq_desc_ptrs[i] = NULL;
return arch_early_irq_init();
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+ if (irq_desc_ptrs && irq < nr_irqs)
+ return irq_desc_ptrs[irq];
+
+ return NULL;
}
struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
unsigned long flags;
int node;
- if (irq >= NR_IRQS) {
- printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
- irq, NR_IRQS);
- WARN_ON(1);
+ if (irq >= nr_irqs) {
+ WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+ irq, nr_irqs);
return NULL;
}
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
- #ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
- #endif
}
};
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
struct irq_desc *desc;
init_irq_default_affinity();
+ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
desc[i].irq = i;
+ init_alloc_desc_masks(&desc[i], 0, true);
+ desc[i].kstat_irqs = kstat_irqs_all[i];
}
-
return arch_early_irq_init();
}
}
#endif /* !CONFIG_SPARSE_IRQ */
+void clear_kstat_irqs(struct irq_desc *desc)
+{
+ memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+}
+
/*
* What should we do if we get a hw irq event on an illegal vector?
* Each architecture has to answer this themself.
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
+ WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
+
if (!(action->flags & IRQF_DISABLED))
local_irq_enable_in_hardirq();
}
#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+
+#ifdef CONFIG_ENABLE_WARN_DEPRECATED
+# warning __do_IRQ is deprecated. Please convert to proper flow handlers
+#endif
+
/**
* __do_IRQ - original all in one highlevel IRQ handler
* @irq: the interrupt number
}
}
-#ifdef CONFIG_SPARSE_IRQ
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
return desc ? desc->kstat_irqs[cpu] : 0;
}
-#endif
EXPORT_SYMBOL(kstat_irqs_cpu);
extern struct lock_class_key irq_desc_lock_class;
extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void clear_kstat_irqs(struct irq_desc *desc);
extern spinlock_t sparse_irq_lock;
+
+ #ifdef CONFIG_SPARSE_IRQ
+ /* irq_desc_ptrs allocated at boot time */
+ extern struct irq_desc **irq_desc_ptrs;
+ #else
+ /* irq_desc_ptrs is a fixed size array */
extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+ #endif
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- cpumask_copy(&desc->affinity, cpumask);
+ cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
} else {
desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, cpumask);
+ cpumask_copy(desc->pending_mask, cpumask);
}
#else
- cpumask_copy(&desc->affinity, cpumask);
+ cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
desc->status |= IRQ_AFFINITY_SET;
/*
* Generic version of the affinity autoselector.
*/
-int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
+static int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
if (!irq_can_set_affinity(irq))
return 0;
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+ if (cpumask_any_and(desc->affinity, cpu_online_mask)
< nr_cpu_ids)
goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+ cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
set_affinity:
- desc->chip->set_affinity(irq, &desc->affinity);
+ desc->chip->set_affinity(irq, desc->affinity);
return 0;
}
#else
-static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
{
return irq_select_affinity(irq);
}
int ret;
spin_lock_irqsave(&desc->lock, flags);
- ret = do_irq_select_affinity(irq, desc);
+ ret = setup_affinity(irq, desc);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#else
-static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
return 0;
}
* allocate special interrupts that are part of the architecture.
*/
static int
-__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
+__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
- struct irqaction *old, **p;
+ struct irqaction *old, **old_ptr;
const char *old_name = NULL;
unsigned long flags;
int shared = 0;
* The following block of code has to be executed atomically
*/
spin_lock_irqsave(&desc->lock, flags);
- p = &desc->action;
- old = *p;
+ old_ptr = &desc->action;
+ old = *old_ptr;
if (old) {
/*
* Can't share interrupts unless both agree to and are
/* add new interrupt at end of irq queue */
do {
- p = &old->next;
- old = *p;
+ old_ptr = &old->next;
+ old = *old_ptr;
} while (old);
shared = 1;
}
desc->status |= IRQ_NO_BALANCING;
/* Set default affinity mask once everything is setup */
- do_irq_select_affinity(irq, desc);
+ setup_affinity(irq, desc);
} else if ((new->flags & IRQF_TRIGGER_MASK)
&& (new->flags & IRQF_TRIGGER_MASK)
(int)(new->flags & IRQF_TRIGGER_MASK));
}
- *p = new;
+ *old_ptr = new;
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
return __setup_irq(irq, desc, act);
}
+EXPORT_SYMBOL_GPL(setup_irq);
-/**
- * free_irq - free an interrupt
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
- *
- * This function must not be called from interrupt context.
+ /*
+ * Internal function to unregister an irqaction - used to free
+ * regular and special interrupts that are part of the architecture.
*/
-void free_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction **p;
+ struct irqaction *action, **action_ptr;
unsigned long flags;
- WARN_ON(in_interrupt());
+ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
- return;
+ return NULL;
spin_lock_irqsave(&desc->lock, flags);
- p = &desc->action;
+
+ /*
+ * There can be multiple actions per IRQ descriptor, find the right
+ * one based on the dev_id:
+ */
+ action_ptr = &desc->action;
for (;;) {
- struct irqaction *action = *p;
+ action = *action_ptr;
- if (action) {
- struct irqaction **pp = p;
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ spin_unlock_irqrestore(&desc->lock, flags);
- p = &action->next;
- if (action->dev_id != dev_id)
- continue;
+ return NULL;
+ }
- /* Found it - now remove it from the list of entries */
- *pp = action->next;
+ if (action->dev_id == dev_id)
+ break;
+ action_ptr = &action->next;
+ }
- /* Currently used only by UML, might disappear one day.*/
+ /* Found it - now remove it from the list of entries: */
+ *action_ptr = action->next;
+
+ /* Currently used only by UML, might disappear one day: */
#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->chip->release)
- desc->chip->release(irq, dev_id);
+ if (desc->chip->release)
+ desc->chip->release(irq, dev_id);
#endif
- if (!desc->action) {
- desc->status |= IRQ_DISABLED;
- if (desc->chip->shutdown)
- desc->chip->shutdown(irq);
- else
- desc->chip->disable(irq);
- }
- spin_unlock_irqrestore(&desc->lock, flags);
- unregister_handler_proc(irq, action);
+ /* If this was the last handler, shut down the IRQ line: */
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->chip->shutdown)
+ desc->chip->shutdown(irq);
+ else
+ desc->chip->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU: */
+ synchronize_irq(irq);
- /* Make sure it's not being used on another CPU */
- synchronize_irq(irq);
-#ifdef CONFIG_DEBUG_SHIRQ
- /*
- * It's a shared IRQ -- the driver ought to be
- * prepared for it to happen even now it's
- * being freed, so let's make sure.... We do
- * this after actually deregistering it, to
- * make sure that a 'real' IRQ doesn't run in
- * parallel with our fake
- */
- if (action->flags & IRQF_SHARED) {
- local_irq_save(flags);
- action->handler(irq, dev_id);
- local_irq_restore(flags);
- }
-#endif
- kfree(action);
- return;
- }
- printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
#ifdef CONFIG_DEBUG_SHIRQ
- dump_stack();
-#endif
- spin_unlock_irqrestore(&desc->lock, flags);
- return;
+ /*
+ * It's a shared IRQ -- the driver ought to be prepared for an IRQ
+ * event to happen even now it's being freed, so let's make sure that
+ * is so by doing an extra call to the handler ....
+ *
+ * ( We do this after actually deregistering it, to make sure that a
+ * 'real' IRQ doesn't run in * parallel with our fake. )
+ */
+ if (action->flags & IRQF_SHARED) {
+ local_irq_save(flags);
+ action->handler(irq, dev_id);
+ local_irq_restore(flags);
}
+#endif
+ return action;
+}
+
+/**
+ * remove_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_irq(unsigned int irq, struct irqaction *act)
+{
+ __free_irq(irq, act->dev_id);
+}
+EXPORT_SYMBOL_GPL(remove_irq);
+
+/**
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ kfree(__free_irq(irq, dev_id));
}
EXPORT_SYMBOL(free_irq);
* the behavior is classified as "will not fix" so we need to
* start nudging drivers away from using that idiom.
*/
- if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
- == (IRQF_SHARED|IRQF_DISABLED))
- pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
- "guaranteed on shared IRQs\n",
- irq, devname);
+ if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
+ (IRQF_SHARED|IRQF_DISABLED)) {
+ pr_warning(
+ "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
+ irq, devname);
+ }
#ifdef CONFIG_LOCKDEP
/*
if (!handler)
return -EINVAL;
- action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;
action->handler = handler;
action->flags = irqflags;
- cpus_clear(action->mask);
action->name = devname;
- action->next = NULL;
action->dev_id = dev_id;
retval = __setup_irq(irq, desc, action);
struct irq_desc *desc,
int cpu, int nr)
{
- unsigned long bytes;
-
init_kstat_irqs(desc, cpu, nr);
- if (desc->kstat_irqs != old_desc->kstat_irqs) {
- /* Compute how many bytes we need per irq and allocate them */
- bytes = nr * sizeof(unsigned int);
-
- memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
- }
+ if (desc->kstat_irqs != old_desc->kstat_irqs)
+ memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
+ nr * sizeof(*desc->kstat_irqs));
}
static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->kstat_irqs = NULL;
}
- static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
struct irq_desc *desc, int cpu)
{
memcpy(desc, old_desc, sizeof(struct irq_desc));
+ if (!init_alloc_desc_masks(desc, cpu, false)) {
+ printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+ "for migration.\n", irq);
+ return false;
+ }
spin_lock_init(&desc->lock);
desc->cpu = cpu;
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+ init_copy_desc_masks(old_desc, desc);
arch_init_copy_chip_data(old_desc, desc, cpu);
+ return true;
}
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
node = cpu_to_node(cpu);
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
if (!desc) {
- printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+ printk(KERN_ERR "irq %d: can not get new irq_desc "
+ "for migration.\n", irq);
+ /* still use old one */
+ desc = old_desc;
+ goto out_unlock;
+ }
+ if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
/* still use old one */
+ kfree(desc);
desc = old_desc;
goto out_unlock;
}
- init_copy_one_irq_desc(irq, old_desc, desc, cpu);
irq_desc_ptrs[irq] = desc;
spin_unlock_irqrestore(&sparse_irq_lock, flags);
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/async.h>
+ #include <linux/percpu.h>
#if 0
#define DEBUGP printk
}
#ifdef CONFIG_SMP
+
+ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+ static void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
+ {
+ void *ptr;
+
+ if (align > PAGE_SIZE) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ ptr = __alloc_reserved_percpu(size, align);
+ if (!ptr)
+ printk(KERN_WARNING
+ "Could not allocate %lu bytes percpu data\n", size);
+ return ptr;
+ }
+
+ static void percpu_modfree(void *freeme)
+ {
+ free_percpu(freeme);
+ }
+
+ #else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
/* Size of each block. -ve means used. */
}
}
- static unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
- {
- return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
- }
-
- static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
- {
- int cpu;
-
- for_each_possible_cpu(cpu)
- memcpy(pcpudest + per_cpu_offset(cpu), from, size);
- }
-
static int percpu_modinit(void)
{
pcpu_num_used = 2;
return 0;
}
__initcall(percpu_modinit);
+
+ #endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+ {
+ return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+ }
+
+ static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+ {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+ }
+
#else /* ... !CONFIG_SMP */
+
static inline void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
/* pcpusec should be 0, and size of that section should be 0. */
BUG_ON(size != 0);
}
+
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
mutex_lock(&module_mutex);
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
- unregister_dynamic_debug_module(mod->name);
+ ddebug_remove_module(mod->name);
free_module(mod);
out:
}
#endif /* CONFIG_KALLSYMS */
-static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
- unsigned int i;
-
- for (i = 0; i < num; i++) {
- register_dynamic_debug_module(debug[i].modname,
- debug[i].type,
- debug[i].logical_modname,
- debug[i].flag_names,
- debug[i].hash, debug[i].hash2);
- }
-#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+#ifdef CONFIG_DYNAMIC_DEBUG
+ if (ddebug_add_module(debug, num, debug->modname))
+ printk(KERN_ERR "dynamic debug error adding module: %s\n",
+ debug->modname);
+#endif
}
static void *module_alloc_update_bounds(unsigned long size)
if (err < 0)
goto free_mod;
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
- mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
- mod->name);
- if (!mod->refptr) {
- err = -ENOMEM;
- goto free_mod;
- }
-#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_percpu;
+ goto free_mod;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
/* Module has been moved. */
mod = (void *)sechdrs[modindex].sh_addr;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_init;
+ }
+#endif
/* Now we've moved module, initialize linked lists, etc. */
module_unload_init(mod);
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
if (!mod->taints) {
- struct mod_debug *debug;
+ struct _ddebug *debug;
unsigned int num_debug;
debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
sizeof(*debug), &num_debug);
- dynamic_printk_setup(debug, num_debug);
+ if (debug)
+ dynamic_debug_setup(debug, num_debug);
}
/* sechdrs[0].sh_size is always zero */
ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
+ free_init:
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
module_free(mod, mod->module_init);
free_core:
module_free(mod, mod->module_core);
+ /* mod will be freed with core. Don't access it beyond this line! */
free_percpu:
if (percpu)
percpu_modfree(percpu);
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
- percpu_modfree(mod->refptr);
-#endif
free_mod:
kfree(args);
free_hdr:
{
ktime_t now;
- if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
*/
static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+ return list_empty(&root_task_group.children);
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+ return 1;
+}
+#endif
+
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- int highest_prio; /* highest queued rt task prio */
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
int overloaded;
+ struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned long last_tick_seen;
unsigned char in_nohz_recently;
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned char idle_at_tick;
/* For active balancing */
int active_balance;
int push_cpu;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
- unsigned int yld_exp_empty;
- unsigned int yld_act_empty;
- unsigned int yld_both_empty;
unsigned int yld_count;
/* schedule() stats */
assert_spin_locked(&task_rq(p)->lock);
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ if (test_tsk_need_resched(p))
return;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
- set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+ set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
#endif
+#ifdef CONFIG_PREEMPT
+
/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
*/
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
return ret;
}
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
+ if (wakeup)
+ p->se.start_runtime = p->se.sum_exec_runtime;
+
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep && p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
+ if (sleep) {
+ if (p->se.last_wakeup) {
+ update_avg(&p->se.avg_overlap,
+ p->se.sum_exec_runtime - p->se.last_wakeup);
+ p->se.last_wakeup = 0;
+ } else {
+ update_avg(&p->se.avg_wakeup,
+ sysctl_sched_wakeup_granularity);
+ }
}
sched_info_dequeued(p);
* it must be off the runqueue _entirely_, and not
* preempted!
*
- * So if it wa still runnable (but just not actively
+ * So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
sync = 0;
#ifdef CONFIG_SMP
- if (sched_feat(LB_WAKEUP_UPDATE)) {
+ if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;
this_cpu = raw_smp_processor_id();
activate_task(rq, p, 1);
success = 1;
+ /*
+ * Only attribute actual wakeups done by this task.
+ */
+ if (!in_interrupt()) {
+ struct sched_entity *se = ¤t->se;
+ u64 sample = se->sum_exec_runtime;
+
+ if (se->last_wakeup)
+ sample -= se->last_wakeup;
+ else
+ sample -= se->start_runtime;
+ update_avg(&se->avg_wakeup, sample);
+
+ se->last_wakeup = se->sum_exec_runtime;
+ }
+
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->sched_class->task_wake_up(rq, p);
#endif
out:
- current->se.last_wakeup = current->se.sum_exec_runtime;
-
task_rq_unlock(rq, &flags);
return success;
p->se.prev_sum_exec_runtime = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
+ p->se.start_runtime = 0;
+ p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
put_cpu();
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
/**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+#ifdef CONFIG_SMP
+ int post_schedule = 0;
+
+ if (current->sched_class->needs_post_schedule)
+ post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
rq->prev_mm = NULL;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
- if (current->sched_class->post_schedule)
+ if (post_schedule)
current->sched_class->post_schedule(rq);
#endif
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
+ int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) too many balance attempts have failed.
*/
- if (!task_hot(p, rq->clock, sd) ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ tsk_cache_hot = task_hot(p, rq->clock, sd);
+ if (!tsk_cache_hot ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
return 1;
}
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
pulled++;
rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible kernels
+ * will stop after the first task is pulled to minimize the critical
+ * section.
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ goto out;
+#endif
+
/*
* We only want to steal up to the prescribed amount of weighted load.
*/
sd, idle, all_pinned, &this_best_prio);
class = class->next;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is pulled to minimize
+ * the critical section.
+ */
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
-
+#endif
} while (class && max_load_move > total_load_moved);
return total_load_moved > 0;
return 0;
}
+/********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /** Statistics of this group */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+
+ /* Statistics of the busiest group */
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+
+ int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance; /* Is powersave balance needed for this sd */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_nr_running; /* Nr tasks running in the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long group_capacity;
+ int group_imb; /* Is there an imbalance in the group ? */
+};
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline unsigned int group_first_cpu(struct sched_group *group)
{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- unsigned long max_pull;
- unsigned long busiest_load_per_task, busiest_nr_running;
- unsigned long this_load_per_task, this_nr_running;
- int load_idx, group_imb = 0;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance = 1;
- unsigned long leader_nr_running = 0, min_load_per_task = 0;
- unsigned long min_nr_running = ULONG_MAX;
- struct sched_group *group_min = NULL, *group_leader = NULL;
-#endif
+ return cpumask_first(sched_group_cpus(group));
+}
- max_load = this_load = total_load = total_pwr = 0;
- busiest_load_per_task = busiest_nr_running = 0;
- this_load_per_task = this_nr_running = 0;
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ int load_idx;
- if (idle == CPU_NOT_IDLE)
+ switch (idle) {
+ case CPU_NOT_IDLE:
load_idx = sd->busy_idx;
- else if (idle == CPU_NEWLY_IDLE)
+ break;
+
+ case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx;
- else
+ break;
+ default:
load_idx = sd->idle_idx;
+ break;
+ }
- do {
- unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
- int local_group;
- int i;
- int __group_imb = 0;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long sum_nr_running, sum_weighted_load;
- unsigned long sum_avg_load_per_task;
- unsigned long avg_load_per_task;
+ return load_idx;
+}
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
- if (local_group)
- balance_cpu = cpumask_first(sched_group_cpus(group));
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ sds->power_savings_balance = 0;
+ else {
+ sds->power_savings_balance = 1;
+ sds->min_nr_running = ULONG_MAX;
+ sds->leader_nr_running = 0;
+ }
+}
- /* Tally up the load of all CPUs in the group */
- sum_weighted_load = sum_nr_running = avg_load = 0;
- sum_avg_load_per_task = avg_load_per_task = 0;
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+
+ if (!sds->power_savings_balance)
+ return;
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+ !sds->this_nr_running))
+ sds->power_savings_balance = 0;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- struct rq *rq = cpu_rq(i);
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!sds->power_savings_balance ||
+ sgs->sum_nr_running >= sgs->group_capacity ||
+ !sgs->sum_nr_running)
+ return;
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sgs->sum_nr_running < sds->min_nr_running) ||
+ (sgs->sum_nr_running == sds->min_nr_running &&
+ group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+ sds->group_min = group;
+ sds->min_nr_running = sgs->sum_nr_running;
+ sds->min_load_per_task = sgs->sum_weighted_load /
+ sgs->sum_nr_running;
+ }
- /* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
+ /*
+ * Calculate the group which is almost near its
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sgs->sum_nr_running > sgs->group_capacity - 1)
+ return;
- load = target_load(i, load_idx);
- } else {
- load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
- }
+ if (sgs->sum_nr_running > sds->leader_nr_running ||
+ (sgs->sum_nr_running == sds->leader_nr_running &&
+ group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+ sds->group_leader = group;
+ sds->leader_nr_running = sgs->sum_nr_running;
+ }
+}
- avg_load += load;
- sum_nr_running += rq->nr_running;
- sum_weighted_load += weighted_cpuload(i);
+/**
+ * check_power_save_busiest_group - Check if we have potential to perform
+ * some power-savings balance. If yes, set the busiest group to be
+ * the least loaded group in the sched_domain, so that it's CPUs can
+ * be put to idle.
+ *
+ * @sds: Variable containing the statistics of the sched_domain
+ * under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ if (!sds->power_savings_balance)
+ return 0;
- sum_avg_load_per_task += cpu_avg_load_per_task(i);
- }
+ if (sds->this != sds->group_leader ||
+ sds->group_leader == sds->group_min)
+ return 0;
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu && balance) {
- *balance = 0;
- goto ret;
- }
+ *imbalance = sds->min_load_per_task;
+ sds->busiest = sds->group_min;
- total_load += avg_load;
- total_pwr += group->__cpu_power;
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ group_first_cpu(sds->group_leader);
+ }
- /* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ return 1;
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+ return;
+}
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
- avg_load_per_task = sg_div_cpu_power(group,
- sum_avg_load_per_task * SCHED_LOAD_SCALE);
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+ return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+ enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ int local_group, const struct cpumask *cpus,
+ int *balance, struct sg_lb_stats *sgs)
+{
+ unsigned long load, max_cpu_load, min_cpu_load;
+ int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned long sum_avg_load_per_task;
+ unsigned long avg_load_per_task;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- __group_imb = 1;
+ if (local_group)
+ balance_cpu = group_first_cpu(group);
- group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ /* Tally up the load of all CPUs in the group */
+ sum_avg_load_per_task = avg_load_per_task = 0;
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ if (*sd_idle && rq->nr_running)
+ *sd_idle = 0;
+
+ /* Bias balancing toward cpus of our domain */
if (local_group) {
- this_load = avg_load;
- this = group;
- this_nr_running = sum_nr_running;
- this_load_per_task = sum_weighted_load;
- } else if (avg_load > max_load &&
- (sum_nr_running > group_capacity || __group_imb)) {
- max_load = avg_load;
- busiest = group;
- busiest_nr_running = sum_nr_running;
- busiest_load_per_task = sum_weighted_load;
- group_imb = __group_imb;
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
+ load = target_load(i, load_idx);
+ } else {
+ load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE ||
- !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto group_next;
+ sgs->group_load += load;
+ sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_weighted_load += weighted_cpuload(i);
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (this_nr_running >= group_capacity ||
- !this_nr_running))
- power_savings_balance = 0;
+ sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ }
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!power_savings_balance || sum_nr_running >= group_capacity
- || !sum_nr_running)
- goto group_next;
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ return;
+ }
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sum_nr_running < min_nr_running) ||
- (sum_nr_running == min_nr_running &&
- cpumask_first(sched_group_cpus(group)) >
- cpumask_first(sched_group_cpus(group_min)))) {
- group_min = group;
- min_nr_running = sum_nr_running;
- min_load_per_task = sum_weighted_load /
- sum_nr_running;
- }
+ /* Adjust by relative CPU power of the group */
+ sgs->avg_load = sg_div_cpu_power(group,
+ sgs->group_load * SCHED_LOAD_SCALE);
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sum_nr_running <= group_capacity - 1) {
- if (sum_nr_running > leader_nr_running ||
- (sum_nr_running == leader_nr_running &&
- cpumask_first(sched_group_cpus(group)) <
- cpumask_first(sched_group_cpus(group_leader)))) {
- group_leader = group;
- leader_nr_running = sum_nr_running;
- }
+
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of two tasks.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ avg_load_per_task = sg_div_cpu_power(group,
+ sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ sgs->group_imb = 1;
+
+ sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
+
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+ enum cpu_idle_type idle, int *sd_idle,
+ const struct cpumask *cpus, int *balance,
+ struct sd_lb_stats *sds)
+{
+ struct sched_group *group = sd->groups;
+ struct sg_lb_stats sgs;
+ int load_idx;
+
+ init_sd_power_savings_stats(sd, sds, idle);
+ load_idx = get_sd_load_idx(sd, idle);
+
+ do {
+ int local_group;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+ memset(&sgs, 0, sizeof(sgs));
+ update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+ local_group, cpus, balance, &sgs);
+
+ if (local_group && balance && !(*balance))
+ return;
+
+ sds->total_load += sgs.group_load;
+ sds->total_pwr += group->__cpu_power;
+
+ if (local_group) {
+ sds->this_load = sgs.avg_load;
+ sds->this = group;
+ sds->this_nr_running = sgs.sum_nr_running;
+ sds->this_load_per_task = sgs.sum_weighted_load;
+ } else if (sgs.avg_load > sds->max_load &&
+ (sgs.sum_nr_running > sgs.group_capacity ||
+ sgs.group_imb)) {
+ sds->max_load = sgs.avg_load;
+ sds->busiest = group;
+ sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->group_imb = sgs.group_imb;
}
-group_next:
-#endif
+
+ update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || busiest_nr_running == 0)
- goto out_balanced;
-
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+}
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
- goto out_balanced;
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * amongst the groups of a sched_domain, during
+ * load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned int imbn = 2;
+
+ if (sds->this_nr_running) {
+ sds->this_load_per_task /= sds->this_nr_running;
+ if (sds->busiest_load_per_task >
+ sds->this_load_per_task)
+ imbn = 1;
+ } else
+ sds->this_load_per_task =
+ cpu_avg_load_per_task(this_cpu);
- busiest_load_per_task /= busiest_nr_running;
- if (group_imb)
- busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+ sds->busiest_load_per_task * imbn) {
+ *imbalance = sds->busiest_load_per_task;
+ return;
+ }
/*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load, as either of these
- * actions would just result in more rebalancing later, and ping-pong
- * tasks around. Thus we look for the minimum possible imbalance.
- * Negative imbalances (*we* are more loaded than anyone else) will
- * be counted as no imbalance for these purposes -- we can't fix that
- * by pulling tasks to us. Be careful of negative numbers as they'll
- * appear as very large values with unsigned longs.
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
*/
- if (max_load <= busiest_load_per_task)
- goto out_balanced;
+ pwr_now += sds->busiest->__cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load);
+ pwr_now += sds->this->__cpu_power *
+ min(sds->this_load_per_task, sds->this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = sg_div_cpu_power(sds->busiest,
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ if (sds->max_load > tmp)
+ pwr_move += sds->busiest->__cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+ /* Amount of load we'd add */
+ if (sds->max_load * sds->busiest->__cpu_power <
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = sg_div_cpu_power(sds->this,
+ sds->max_load * sds->busiest->__cpu_power);
+ else
+ tmp = sg_div_cpu_power(sds->this,
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += sds->this->__cpu_power *
+ min(sds->this_load_per_task, sds->this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain throughput */
+ if (pwr_move > pwr_now)
+ *imbalance = sds->busiest_load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+ unsigned long *imbalance)
+{
+ unsigned long max_pull;
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (max_load < avg_load) {
+ if (sds->max_load < sds->avg_load) {
*imbalance = 0;
- goto small_imbalance;
+ return fix_small_imbalance(sds, this_cpu, imbalance);
}
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ max_pull = min(sds->max_load - sds->avg_load,
+ sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->__cpu_power,
- (avg_load - this_load) * this->__cpu_power)
+ *imbalance = min(max_pull * sds->busiest->__cpu_power,
+ (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < busiest_load_per_task) {
- unsigned long tmp, pwr_now, pwr_move;
- unsigned int imbn;
-
-small_imbalance:
- pwr_move = pwr_now = 0;
- imbn = 2;
- if (this_nr_running) {
- this_load_per_task /= this_nr_running;
- if (busiest_load_per_task > this_load_per_task)
- imbn = 1;
- } else
- this_load_per_task = cpu_avg_load_per_task(this_cpu);
+ if (*imbalance < sds->busiest_load_per_task)
+ return fix_small_imbalance(sds, this_cpu, imbalance);
- if (max_load - this_load + busiest_load_per_task >=
- busiest_load_per_task * imbn) {
- *imbalance = busiest_load_per_task;
- return busiest;
- }
+}
+/******* find_busiest_group() helpers end here *********************/
- /*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
- * moving them.
- */
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ * be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ * is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns: - the busiest group if imbalance exists.
+ * - If no imbalance and user has opted for power-savings balance,
+ * return the least loaded group whose CPUs can be
+ * put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+ struct sd_lb_stats sds;
- pwr_now += busiest->__cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->__cpu_power *
- min(this_load_per_task, this_load);
- pwr_now /= SCHED_LOAD_SCALE;
-
- /* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(busiest,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- if (max_load > tmp)
- pwr_move += busiest->__cpu_power *
- min(busiest_load_per_task, max_load - tmp);
-
- /* Amount of load we'd add */
- if (max_load * busiest->__cpu_power <
- busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(this,
- max_load * busiest->__cpu_power);
- else
- tmp = sg_div_cpu_power(this,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += this->__cpu_power *
- min(this_load_per_task, this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ memset(&sds, 0, sizeof(sds));
- /* Move if we gain throughput */
- if (pwr_move > pwr_now)
- *imbalance = busiest_load_per_task;
- }
+ /*
+ * Compute the various statistics relavent for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+ balance, &sds);
+
+ /* Cases where imbalance does not exist from POV of this_cpu */
+ /* 1) this_cpu is not the appropriate cpu to perform load balancing
+ * at this level.
+ * 2) There is no busy sibling group to pull from.
+ * 3) This group is the busiest group.
+ * 4) This group is more busy than the avg busieness at this
+ * sched_domain.
+ * 5) The imbalance is within the specified limit.
+ * 6) Any rebalance would lead to ping-pong
+ */
+ if (balance && !(*balance))
+ goto ret;
- return busiest;
+ if (!sds.busiest || sds.busiest_nr_running == 0)
+ goto out_balanced;
-out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto ret;
+ if (sds.this_load >= sds.max_load)
+ goto out_balanced;
- if (this == group_leader && group_leader != group_min) {
- *imbalance = min_load_per_task;
- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- cpumask_first(sched_group_cpus(group_leader));
- }
- return group_min;
- }
-#endif
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+ if (sds.this_load >= sds.avg_load)
+ goto out_balanced;
+
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+
+ sds.busiest_load_per_task /= sds.busiest_nr_running;
+ if (sds.group_imb)
+ sds.busiest_load_per_task =
+ min(sds.busiest_load_per_task, sds.avg_load);
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
+ */
+ if (sds.max_load <= sds.busiest_load_per_task)
+ goto out_balanced;
+
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(&sds, this_cpu, imbalance);
+ return sds.busiest;
+
+out_balanced:
+ /*
+ * There is no obvious imbalance. But check if we can do some balancing
+ * to save power.
+ */
+ if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+ return sds.busiest;
ret:
*imbalance = 0;
return NULL;
#endif
}
+static inline int on_null_domain(int cpu)
+{
+ return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*
cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
- if (time_after_eq(jiffies, rq->next_balance))
+ /* Don't need to rebalance while attached to NULL domain */
+ if (time_after_eq(jiffies, rq->next_balance) &&
+ likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
}
#endif
}
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->state == TASK_RUNNING) {
+ u64 runtime = prev->se.sum_exec_runtime;
+
+ runtime -= prev->se.prev_sum_exec_runtime;
+ runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+ /*
+ * In order to avoid avg_overlap growing stale when we are
+ * indeed overlapping and hence not getting put to sleep, grow
+ * the avg_overlap on preemption.
+ *
+ * We use the average preemption runtime because that
+ * correlates to the amount of cache footprint a task can
+ * build up.
+ */
+ update_avg(&prev->se.avg_overlap, runtime);
+ }
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- prev->sched_class->put_prev_task(rq, prev);
- next = pick_next_task(rq, prev);
+ put_prev_task(rq, prev);
+ next = pick_next_task(rq);
if (likely(prev != next)) {
sched_info_switch(prev, next);
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
#endif /* CONFIG_PREEMPT */
if (increment > 40)
increment = 40;
- nice = PRIO_TO_NICE(current->static_prio) + increment;
+ nice = TASK_NICE(current) + increment;
if (nice < -20)
nice = -20;
if (nice > 19)
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- {
- unsigned long *n = end_of_stack(p);
- while (!*n)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(p);
- }
+ free = stack_not_used(p);
#endif
printk(KERN_CONT "%5lu %5d %6d\n", free,
task_pid_nr(p), task_pid_nr(p->real_parent));
if (!rq->nr_running)
break;
update_rq_clock(rq);
- next = pick_next_task(rq, rq->curr);
+ next = pick_next_task(rq);
if (!next)
break;
next->sched_class->put_prev_task(rq, next);
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- rt_rq->highest_prio = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
+ plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
return ret;
}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
struct cpuacct *ca;
int cpu;
- if (!cpuacct_subsys.active)
+ if (unlikely(!cpuacct_subsys.active))
return;
cpu = task_cpu(tsk);
ca = task_ca(tsk);
for (; ca; ca = ca->parent) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
}
* policies)
*/
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->rt;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
#ifdef CONFIG_SMP
static inline int rt_overloaded(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rq *rq)
+static void update_rt_migration(struct rt_rq *rt_rq)
{
- if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- if (!rq->rt.overloaded) {
- rt_set_overload(rq);
- rq->rt.overloaded = 1;
+ if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ if (!rt_rq->overloaded) {
+ rt_set_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 1;
}
- } else if (rq->rt.overloaded) {
- rt_clear_overload(rq);
- rq->rt.overloaded = 0;
+ } else if (rt_rq->overloaded) {
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 0;
}
}
-#endif /* CONFIG_SMP */
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory++;
+
+ update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory--;
+
+ update_rt_migration(rt_rq);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
- return container_of(rt_se, struct task_struct, rt);
}
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+#endif /* CONFIG_SMP */
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- return rt_se->rt_rq;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
- if (rt_rq->highest_prio < curr->prio)
+ if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
-
- return &rq->rt;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
- return rt_rq->highest_prio;
+ return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
}
}
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+#if defined CONFIG_SMP
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
{
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
-#ifdef CONFIG_SMP
- struct rq *rq = rq_of_rt_rq(rt_rq);
-#endif
+ struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+ if (next && rt_prio(next->prio))
+ return next->prio;
+ else
+ return MAX_RT_PRIO;
+}
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (prio < prev_prio) {
+
+ /*
+ * If the new task is higher in priority than anything on the
+ * run-queue, we know that the previous high becomes our
+ * next-highest.
+ */
+ rt_rq->highest_prio.next = prev_prio;
- rt_rq->highest_prio = rt_se_prio(rt_se);
-#ifdef CONFIG_SMP
if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_se_prio(rt_se));
-#endif
- }
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
- rq->rt.rt_nr_migratory++;
- }
+ } else if (prio == rt_rq->highest_prio.curr)
+ /*
+ * If the next task is equal in priority to the highest on
+ * the run-queue, then we implicitly know that the next highest
+ * task cannot be any lower than current
+ */
+ rt_rq->highest_prio.next = prio;
+ else if (prio < rt_rq->highest_prio.next)
+ /*
+ * Otherwise, we need to recompute next-highest
+ */
+ rt_rq->highest_prio.next = next_prio(rq);
+}
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- if (rt_se_boosted(rt_se))
- rt_rq->rt_nr_boosted++;
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-#else
- start_rt_bandwidth(&def_rt_bandwidth);
-#endif
+ if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ rt_rq->highest_prio.next = next_prio(rq);
+
+ if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
+#else /* CONFIG_SMP */
+
static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_SMP
- int highest_prio = rt_rq->highest_prio;
-#endif
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (prio < prev_prio)
+ rt_rq->highest_prio.curr = prio;
+
+ inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
if (rt_rq->rt_nr_running) {
- struct rt_prio_array *array;
- WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- /* recalculate */
- array = &rt_rq->active;
- rt_rq->highest_prio =
+ WARN_ON(prio < prev_prio);
+
+ /*
+ * This may have been our highest task, and therefore
+ * we may have some recomputation to do
+ */
+ if (prio == prev_prio) {
+ struct rt_prio_array *array = &rt_rq->active;
+
+ rt_rq->highest_prio.curr =
sched_find_first_bit(array->bitmap);
- } /* otherwise leave rq->highest prio alone */
+ }
+
} else
- rt_rq->highest_prio = MAX_RT_PRIO;
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
- rq->rt.rt_nr_migratory--;
- }
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
- if (rt_rq->highest_prio != highest_prio) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
- if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_rq->highest_prio);
- }
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-#endif
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ int prio = rt_se_prio(rt_se);
+
+ WARN_ON(!rt_prio(prio));
+ rt_rq->rt_nr_running++;
+
+ inc_rt_prio(rt_rq, prio);
+ inc_rt_migration(rt_se, rt_rq);
+ inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running--;
+
+ dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ dec_rt_migration(rt_se, rt_rq);
+ dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
enqueue_rt_entity(rt_se);
+ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+
inc_cpu_load(rq, p->se.load.weight);
}
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ dequeue_pushable_task(rq, p);
+
dec_cpu_load(rq, p->se.load.weight);
}
return next;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = _pick_next_task_rt(rq);
+
+ /* The running task is never eligible for pushing */
+ if (p)
+ dequeue_pushable_task(rq, p);
+
return p;
}
{
update_curr_rt(rq);
p->se.exec_start = 0;
+
+ /*
+ * The previous task needs to be made eligible for pushing
+ * if it is still active
+ */
+ if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
- static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+ static inline int pick_optimal_cpu(int this_cpu,
+ const struct cpumask *mask)
{
int first;
/* "this_cpu" is cheaper to preempt than a remote processor */
- if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+ if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
return this_cpu;
first = cpumask_first(mask);
struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ cpumask_var_t domain_mask;
if (task->rt.nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (this_cpu == cpu)
this_cpu = -1; /* Skip this_cpu opt if the same */
- for_each_domain(cpu, sd) {
- if (sd->flags & SD_WAKE_AFFINE) {
- cpumask_t domain_mask;
- int best_cpu;
+ if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
- cpumask_and(&domain_mask, sched_domain_span(sd),
- lowest_mask);
+ cpumask_and(domain_mask,
+ sched_domain_span(sd),
+ lowest_mask);
- best_cpu = pick_optimal_cpu(this_cpu,
- &domain_mask);
- if (best_cpu != -1)
- return best_cpu;
+ best_cpu = pick_optimal_cpu(this_cpu,
+ domain_mask);
+
+ if (best_cpu != -1) {
+ free_cpumask_var(domain_mask);
+ return best_cpu;
+ }
+ }
}
+ free_cpumask_var(domain_mask);
}
/*
}
/* If this rq is still suitable use it. */
- if (lowest_rq->rt.highest_prio > task->prio)
+ if (lowest_rq->rt.highest_prio.curr > task->prio)
break;
/* try again */
return lowest_rq;
}
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+ BUG_ON(!p->se.on_rq);
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
{
struct task_struct *next_task;
struct rq *lowest_rq;
- int ret = 0;
- int paranoid = RT_MAX_TRIES;
if (!rq->rt.overloaded)
return 0;
- next_task = pick_next_highest_task_rt(rq, -1);
+ next_task = pick_next_pushable_task(rq);
if (!next_task)
return 0;
struct task_struct *task;
/*
* find lock_lowest_rq releases rq->lock
- * so it is possible that next_task has changed.
- * If it has, then try again.
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
*/
- task = pick_next_highest_task_rt(rq, -1);
- if (unlikely(task != next_task) && task && paranoid--) {
- put_task_struct(next_task);
- next_task = task;
- goto retry;
+ task = pick_next_pushable_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * If we get here, the task hasnt moved at all, but
+ * it has failed to push. We will not try again,
+ * since the other cpus will pull from us when they
+ * are ready.
+ */
+ dequeue_pushable_task(rq, next_task);
+ goto out;
}
- goto out;
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
}
deactivate_task(rq, next_task, 0);
double_unlock_balance(rq, lowest_rq);
- ret = 1;
out:
put_task_struct(next_task);
- return ret;
+ return 1;
}
-/*
- * TODO: Currently we just use the second highest prio task on
- * the queue, and stop when it can't migrate (or there's
- * no more RT tasks). There may be a case where a lower
- * priority RT task has a different affinity than the
- * higher RT task. In this case the lower RT task could
- * possibly be able to migrate where as the higher priority
- * RT task could not. We currently ignore this issue.
- * Enhancements are welcome!
- */
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p, *next;
+ struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
return 0;
- next = pick_next_task_rt(this_rq);
-
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if its going logically lower, we do not care
+ */
+ if (src_rq->rt.highest_prio.next >=
+ this_rq->rt.highest_prio.curr)
+ continue;
+
/*
* We can potentially drop this_rq's lock in
* double_lock_balance, and another CPU could
- * steal our next task - hence we must cause
- * the caller to recalculate the next task
- * in that case:
+ * alter this_rq
*/
- if (double_lock_balance(this_rq, src_rq)) {
- struct task_struct *old_next = next;
-
- next = pick_next_task_rt(this_rq);
- if (next != old_next)
- ret = 1;
- }
+ double_lock_balance(this_rq, src_rq);
/*
* Are there still pullable RT tasks?
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- if (p && (!next || (p->prio < next->prio))) {
+ if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq);
* This is just that p is wakeing up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
- * current task on the run queue or
- * this_rq next task is lower in prio than
- * the current task on that rq.
+ * current task on the run queue
*/
- if (p->prio < src_rq->curr->prio ||
- (next && next->prio < src_rq->curr->prio))
+ if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
* case there's an even higher prio task
* in another runqueue. (low likelyhood
* but possible)
- *
- * Update next so that we won't pick a task
- * on another cpu with a priority lower (or equal)
- * than the one we just picked.
*/
- next = p;
-
}
skip:
double_unlock_balance(this_rq, src_rq);
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
+/*
+ * assumes rq->lock is held
+ */
+static int needs_post_schedule_rt(struct rq *rq)
+{
+ return has_pushable_tasks(rq);
+}
+
static void post_schedule_rt(struct rq *rq)
{
/*
- * If we have more than one rt_task queued, then
- * see if we can push the other rt_tasks off to other CPUS.
- * Note we may release the rq lock, and since
- * the lock was owned by prev, we need to release it
- * first via finish_lock_switch and then reaquire it here.
+ * This is only called if needs_post_schedule_rt() indicates that
+ * we need to push tasks away
*/
- if (unlikely(rq->rt.overloaded)) {
- spin_lock_irq(&rq->lock);
- push_rt_tasks(rq);
- spin_unlock_irq(&rq->lock);
- }
+ spin_lock_irq(&rq->lock);
+ push_rt_tasks(rq);
+ spin_unlock_irq(&rq->lock);
}
/*
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- rq->rt.overloaded)
+ has_pushable_tasks(rq) &&
+ p->rt.nr_cpus_allowed > 1)
push_rt_tasks(rq);
}
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
+ if (!task_current(rq, p)) {
+ /*
+ * Make sure we dequeue this task from the pushable list
+ * before going further. It will either remain off of
+ * the list because we are no longer pushable, or it
+ * will be requeued.
+ */
+ if (p->rt.nr_cpus_allowed > 1)
+ dequeue_pushable_task(rq, p);
+
+ /*
+ * Requeue if our weight is changing and still > 1
+ */
+ if (weight > 1)
+ enqueue_pushable_task(rq, p);
+
+ }
+
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
rq->rt.rt_nr_migratory--;
}
- update_rt_migration(rq);
+ update_rt_migration(&rq->rt);
}
cpumask_copy(&p->cpus_allowed, new_mask);
__enable_runtime(rq);
- cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
/* Assumes rq->lock is held */
* can release the rq lock and p could migrate.
* Only reschedule if p is still on the same runqueue.
*/
- if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
+ .needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,
preempt_enable_no_resched();
cond_resched();
preempt_disable();
+ rcu_qsctr_inc((long)__bind_cpu);
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
return 0;
}
+ int __init __weak arch_probe_nr_irqs(void)
+ {
+ return 0;
+ }
+
int __init __weak arch_early_irq_init(void)
{
return 0;
config LZO_DECOMPRESS
tristate
+ #
+ # These all provide a common interface (hence the apparent duplication with
+ # ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.)
+ #
+ config DECOMPRESS_GZIP
+ select ZLIB_INFLATE
+ tristate
+
+ config DECOMPRESS_BZIP2
+ tristate
+
+ config DECOMPRESS_LZMA
+ tristate
+
#
# Generic allocator support is selected if needed
#
config TEXTSEARCH_FSM
tristate
-#
-# plist support is select#ed if needed
-#
-config PLIST
- boolean
-
config HAS_IOMEM
boolean
depends on !NO_IOMEM
bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
depends on EXPERIMENTAL && BROKEN
+#
+# Netlink attribute parsing support is select'ed if needed
+#
+config NLATTR
+ bool
+
endmenu
idr.o int_sqrt.o extable.o prio_tree.o \
sha1.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o prio_heap.o ratelimit.o show_mem.o \
- is_single_threaded.o plist.o
- is_single_threaded.o decompress.o
++ is_single_threaded.o plist.o decompress.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
-obj-$(CONFIG_PLIST) += plist.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
obj-$(CONFIG_DEBUG_LIST) += list_debug.o
obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
obj-$(CONFIG_LZO_COMPRESS) += lzo/
obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o
+ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
+ lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
+
obj-$(CONFIG_TEXTSEARCH) += textsearch.o
obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
-obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o
+obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
+
+obj-$(CONFIG_NLATTR) += nlattr.o
hostprogs-y := gen_crc32table
clean-files := crc32table.h
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
- sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
int proto;
int id;
- if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ iph = skb_gro_header(skb, sizeof(*iph));
+ if (unlikely(!iph))
goto out;
- iph = ip_hdr(skb);
proto = iph->protocol & (MAX_INET_PROTOS - 1);
rcu_read_lock();
if (!ops || !ops->gro_receive)
goto out_unlock;
- if (iph->version != 4 || iph->ihl != 5)
+ if (*(u8 *)iph != 0x45)
goto out_unlock;
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- flush = ntohs(iph->tot_len) != skb->len ||
+ flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
iph->frag_off != htons(IP_DF);
id = ntohs(iph->id);
iph2 = ip_hdr(p);
- if (iph->protocol != iph2->protocol ||
- iph->tos != iph2->tos ||
- memcmp(&iph->saddr, &iph2->saddr, 8)) {
+ if ((iph->protocol ^ iph2->protocol) |
+ (iph->tos ^ iph2->tos) |
+ (iph->saddr ^ iph2->saddr) |
+ (iph->daddr ^ iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
- memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
- (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
+ (iph->ttl ^ iph2->ttl) |
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
- __skb_pull(skb, sizeof(*iph));
- skb_reset_transport_header(skb);
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
pp = ops->gro_receive(head, skb);
int snmp_mib_init(void *ptr[2], size_t mibsize)
{
BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize);
+ ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
if (!ptr[0])
goto err0;
- ptr[1] = __alloc_percpu(mibsize);
+ ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
if (!ptr[1])
goto err1;
return 0;
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
- .protocol = __constant_htons(ETH_P_IP),
+ .protocol = cpu_to_be16(ETH_P_IP),
.gc = rt_garbage_collect,
.check = ipv4_dst_check,
.destroy = ipv4_dst_destroy,
static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
- .protocol = __constant_htons(ETH_P_IP),
+ .protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
.check = ipv4_dst_check,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
return ip_route_output_flow(net, rp, flp, NULL, 0);
}
-static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+static int rt_fill_info(struct net *net,
+ struct sk_buff *skb, u32 pid, u32 seq, int event,
int nowait, unsigned int flags)
{
struct rtable *rt = skb->rtable;
__be32 dst = rt->rt_dst;
if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
- int err = ipmr_get_route(skb, r, nowait);
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb, r, nowait);
if (err <= 0) {
if (!nowait) {
if (err == 0)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+ err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err <= 0)
goto errout_free;
if (rt_is_expired(rt))
continue;
skb->dst = dst_clone(&rt->u.dst);
- if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
dst_release(xchg(&skb->dst, NULL));
int rc = 0;
#ifdef CONFIG_NET_CLS_ROUTE
- ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
#endif
-D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
#hash values
-ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ifdef CONFIG_DYNAMIC_DEBUG
debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\
-D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))"
else
cmd_gzip = gzip -f -9 < $< > $@
+ # Bzip2
+ # ---------------------------------------------------------------------------
+
+ # Bzip2 does not include size in file... so we have to fake that
+ size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
+
+ quiet_cmd_bzip2 = BZIP2 $@
+ cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+
+ # Lzma
+ # ---------------------------------------------------------------------------
+
+ quiet_cmd_lzma = LZMA $@
+ cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)