CONFIG_VLAN_8021Q=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_CMA=y
++CONFIG_DMA_CMA=y
CONFIG_MTD=y
CONFIG_MTD_CMDLINE_PARTS=y
CONFIG_MTD_BLOCK=y
CONFIG_MAC80211_RC_DEFAULT_PID=y
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_CMA=y
++CONFIG_DMA_CMA=y
CONFIG_CONNECTOR=y
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEVTMPFS_MOUNT=y
# CONFIG_FIRMWARE_IN_KERNEL is not set
CONFIG_CMA=y
++CONFIG_DMA_CMA=y
CONFIG_MTD=y
CONFIG_MTD_M25P80=y
CONFIG_PROC_DEVICETREE=y
DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
+ DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar));
+ DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr));
+ DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr));
DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
DEFINE(THREAD_TRANSACT_VR0, offsetof(struct thread_struct,
transact_vr[0]));
DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
#endif
+ DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
#ifdef CONFIG_64BIT
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
#endif
- if (current->mm && current->mm->context.alloc_pgste) {
- /*
- * alloc_pgste indicates, that any NEW context will be created
- * with extended page tables. The old context is unchanged. The
- * page table allocation and the page table operations will
- * look at has_pgste to distinguish normal and extended page
- * tables. The only way to create extended page tables is to
- * set alloc_pgste and then create a new context (e.g. dup_mm).
- * The page table allocation is called after init_new_context
- * and if has_pgste is set, it will create extended page
- * tables.
- */
- mm->context.has_pgste = 1;
- mm->context.alloc_pgste = 1;
- } else {
- mm->context.has_pgste = 0;
- mm->context.alloc_pgste = 0;
- }
+ mm->context.has_pgste = 0;
mm->context.asce_limit = STACK_TOP_MAX;
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
return 0;
WARN_ON(atomic_read(&prev->context.attach_count) < 0);
atomic_inc(&next->context.attach_count);
/* Check for TLBs not flushed yet */
- if (next->context.flush_mm)
- __tlb_flush_mm(next);
+ __tlb_flush_mm_lazy(next);
}
#define enter_lazy_tlb(mm,tsk) do { } while (0)
/* Hardware bits in the page table entry */
#define _PAGE_CO 0x100 /* HW Change-bit override */
-#define _PAGE_RO 0x200 /* HW read-only bit */
+#define _PAGE_PROTECT 0x200 /* HW read-only bit */
#define _PAGE_INVALID 0x400 /* HW invalid bit */
+#define _PAGE_LARGE 0x800 /* Bit to mark a large pte */
/* Software bits in the page table entry */
-#define _PAGE_SWT 0x001 /* SW pte type bit t */
-#define _PAGE_SWX 0x002 /* SW pte type bit x */
-#define _PAGE_SWC 0x004 /* SW pte changed bit */
-#define _PAGE_SWR 0x008 /* SW pte referenced bit */
-#define _PAGE_SWW 0x010 /* SW pte write bit */
-#define _PAGE_SPECIAL 0x020 /* SW associated with special page */
+#define _PAGE_PRESENT 0x001 /* SW pte present bit */
+#define _PAGE_TYPE 0x002 /* SW pte type bit */
+#define _PAGE_YOUNG 0x004 /* SW pte young bit */
+#define _PAGE_DIRTY 0x008 /* SW pte dirty bit */
+#define _PAGE_READ 0x010 /* SW pte read bit */
+#define _PAGE_WRITE 0x020 /* SW pte write bit */
+#define _PAGE_SPECIAL 0x040 /* SW associated with special page */
#define __HAVE_ARCH_PTE_SPECIAL
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_CO | \
- _PAGE_SWC | _PAGE_SWR)
-
-/* Six different types of pages. */
-#define _PAGE_TYPE_EMPTY 0x400
-#define _PAGE_TYPE_NONE 0x401
-#define _PAGE_TYPE_SWAP 0x403
-#define _PAGE_TYPE_FILE 0x601 /* bit 0x002 is used for offset !! */
-#define _PAGE_TYPE_RO 0x200
-#define _PAGE_TYPE_RW 0x000
-
-/*
- * Only four types for huge pages, using the invalid bit and protection bit
- * of a segment table entry.
- */
-#define _HPAGE_TYPE_EMPTY 0x020 /* _SEGMENT_ENTRY_INV */
-#define _HPAGE_TYPE_NONE 0x220
-#define _HPAGE_TYPE_RO 0x200 /* _SEGMENT_ENTRY_RO */
-#define _HPAGE_TYPE_RW 0x000
+ _PAGE_DIRTY | _PAGE_YOUNG)
/*
- * PTE type bits are rather complicated. handle_pte_fault uses pte_present,
- * pte_none and pte_file to find out the pte type WITHOUT holding the page
- * table lock. ptep_clear_flush on the other hand uses ptep_clear_flush to
- * invalidate a given pte. ipte sets the hw invalid bit and clears all tlbs
- * for the page. The page table entry is set to _PAGE_TYPE_EMPTY afterwards.
- * This change is done while holding the lock, but the intermediate step
- * of a previously valid pte with the hw invalid bit set can be observed by
- * handle_pte_fault. That makes it necessary that all valid pte types with
- * the hw invalid bit set must be distinguishable from the four pte types
- * empty, none, swap and file.
+ * handle_pte_fault uses pte_present, pte_none and pte_file to find out the
+ * pte type WITHOUT holding the page table lock. The _PAGE_PRESENT bit
+ * is used to distinguish present from not-present ptes. It is changed only
+ * with the page table lock held.
+ *
+ * The following table gives the different possible bit combinations for
+ * the pte hardware and software bits in the last 12 bits of a pte:
*
- * irxt ipte irxt
- * _PAGE_TYPE_EMPTY 1000 -> 1000
- * _PAGE_TYPE_NONE 1001 -> 1001
- * _PAGE_TYPE_SWAP 1011 -> 1011
- * _PAGE_TYPE_FILE 11?1 -> 11?1
- * _PAGE_TYPE_RO 0100 -> 1100
- * _PAGE_TYPE_RW 0000 -> 1000
+ * 842100000000
+ * 000084210000
+ * 000000008421
+ * .IR...wrdytp
+ * empty .10...000000
+ * swap .10...xxxx10
+ * file .11...xxxxx0
+ * prot-none, clean, old .11...000001
+ * prot-none, clean, young .11...000101
+ * prot-none, dirty, old .10...001001
+ * prot-none, dirty, young .10...001101
+ * read-only, clean, old .11...010001
+ * read-only, clean, young .01...010101
+ * read-only, dirty, old .11...011001
+ * read-only, dirty, young .01...011101
+ * read-write, clean, old .11...110001
+ * read-write, clean, young .01...110101
+ * read-write, dirty, old .10...111001
+ * read-write, dirty, young .00...111101
*
- * pte_none is true for bits combinations 1000, 1010, 1100, 1110
- * pte_present is true for bits combinations 0000, 0010, 0100, 0110, 1001
- * pte_file is true for bits combinations 1101, 1111
- * swap pte is 1011 and 0001, 0011, 0101, 0111 are invalid.
+ * pte_present is true for the bit pattern .xx...xxxxx1, (pte & 0x001) == 0x001
+ * pte_none is true for the bit pattern .10...xxxx00, (pte & 0x603) == 0x400
+ * pte_file is true for the bit pattern .11...xxxxx0, (pte & 0x601) == 0x600
+ * pte_swap is true for the bit pattern .10...xxxx10, (pte & 0x603) == 0x402
*/
#ifndef CONFIG_64BIT
#define _ASCE_TABLE_LENGTH 0x7f /* 128 x 64 entries = 8k */
/* Bits in the segment table entry */
+#define _SEGMENT_ENTRY_BITS 0x7fffffffUL /* Valid segment table bits */
#define _SEGMENT_ENTRY_ORIGIN 0x7fffffc0UL /* page table origin */
-#define _SEGMENT_ENTRY_RO 0x200 /* page protection bit */
-#define _SEGMENT_ENTRY_INV 0x20 /* invalid segment table entry */
+#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */
+#define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */
#define _SEGMENT_ENTRY_COMMON 0x10 /* common segment bit */
#define _SEGMENT_ENTRY_PTL 0x0f /* page table length */
+#define _SEGMENT_ENTRY_NONE _SEGMENT_ENTRY_PROTECT
#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PTL)
-#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INV)
+#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID)
+
+/*
+ * Segment table entry encoding (I = invalid, R = read-only bit):
+ * ..R...I.....
+ * prot-none ..1...1.....
+ * read-only ..1...0.....
+ * read-write ..0...0.....
+ * empty ..0...1.....
+ */
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS 0xf0000000UL
#define PGSTE_HC_BIT 0x00200000UL
#define PGSTE_GR_BIT 0x00040000UL
#define PGSTE_GC_BIT 0x00020000UL
-#define PGSTE_UR_BIT 0x00008000UL
-#define PGSTE_UC_BIT 0x00004000UL /* user dirty (migration) */
-#define PGSTE_IN_BIT 0x00002000UL /* IPTE notify bit */
+#define PGSTE_IN_BIT 0x00008000UL /* IPTE notify bit */
#else /* CONFIG_64BIT */
/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */
-#define _REGION_ENTRY_RO 0x200 /* region protection bit */
-#define _REGION_ENTRY_INV 0x20 /* invalid region table entry */
+#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */
#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */
#define _REGION_ENTRY_TYPE_R2 0x08 /* region second table type */
#define _REGION_ENTRY_LENGTH 0x03 /* region third length */
#define _REGION1_ENTRY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_LENGTH)
-#define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INV)
+#define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID)
#define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH)
-#define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INV)
+#define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID)
#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
-#define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INV)
+#define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
#define _REGION3_ENTRY_LARGE 0x400 /* RTTE-format control, large page */
#define _REGION3_ENTRY_RO 0x200 /* page protection bit */
#define _REGION3_ENTRY_CO 0x100 /* change-recording override */
/* Bits in the segment table entry */
+#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
+#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff1ff33UL
#define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */
#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin */
-#define _SEGMENT_ENTRY_RO 0x200 /* page protection bit */
-#define _SEGMENT_ENTRY_INV 0x20 /* invalid segment table entry */
+#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */
+#define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */
#define _SEGMENT_ENTRY (0)
-#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INV)
+#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID)
#define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */
#define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */
+#define _SEGMENT_ENTRY_SPLIT 0x001 /* THP splitting bit */
+#define _SEGMENT_ENTRY_YOUNG 0x002 /* SW segment young bit */
+#define _SEGMENT_ENTRY_NONE _SEGMENT_ENTRY_YOUNG
+
+/*
+ * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
+ * ..R...I...y.
+ * prot-none, old ..0...1...1.
+ * prot-none, young ..1...1...1.
+ * read-only, old ..1...1...0.
+ * read-only, young ..1...0...1.
+ * read-write, old ..0...1...0.
+ * read-write, young ..0...0...1.
+ * The segment table origin is used to distinguish empty (origin==0) from
+ * read-write, old segment table entries (origin!=0)
+ */
+
#define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */
-#define _SEGMENT_ENTRY_SPLIT (1UL << _SEGMENT_ENTRY_SPLIT_BIT)
/* Set of bits not changed in pmd_modify */
#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
#define PGSTE_HC_BIT 0x0020000000000000UL
#define PGSTE_GR_BIT 0x0004000000000000UL
#define PGSTE_GC_BIT 0x0002000000000000UL
-#define PGSTE_UR_BIT 0x0000800000000000UL
-#define PGSTE_UC_BIT 0x0000400000000000UL /* user dirty (migration) */
-#define PGSTE_IN_BIT 0x0000200000000000UL /* IPTE notify bit */
+#define PGSTE_IN_BIT 0x0000800000000000UL /* IPTE notify bit */
#endif /* CONFIG_64BIT */
/*
* Page protection definitions.
*/
-#define PAGE_NONE __pgprot(_PAGE_TYPE_NONE)
-#define PAGE_RO __pgprot(_PAGE_TYPE_RO)
-#define PAGE_RW __pgprot(_PAGE_TYPE_RO | _PAGE_SWW)
-#define PAGE_RWC __pgprot(_PAGE_TYPE_RW | _PAGE_SWW | _PAGE_SWC)
-
-#define PAGE_KERNEL PAGE_RWC
-#define PAGE_SHARED PAGE_KERNEL
-#define PAGE_COPY PAGE_RO
+#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID)
+#define PAGE_READ __pgprot(_PAGE_PRESENT | _PAGE_READ | \
+ _PAGE_INVALID | _PAGE_PROTECT)
+#define PAGE_WRITE __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ _PAGE_INVALID | _PAGE_PROTECT)
+
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ _PAGE_YOUNG | _PAGE_DIRTY)
+#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ _PAGE_YOUNG | _PAGE_DIRTY)
+#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_YOUNG | \
+ _PAGE_PROTECT)
/*
* On s390 the page table entry has an invalid bit and a read-only bit.
*/
/*xwr*/
#define __P000 PAGE_NONE
-#define __P001 PAGE_RO
-#define __P010 PAGE_RO
-#define __P011 PAGE_RO
-#define __P100 PAGE_RO
-#define __P101 PAGE_RO
-#define __P110 PAGE_RO
-#define __P111 PAGE_RO
+#define __P001 PAGE_READ
+#define __P010 PAGE_READ
+#define __P011 PAGE_READ
+#define __P100 PAGE_READ
+#define __P101 PAGE_READ
+#define __P110 PAGE_READ
+#define __P111 PAGE_READ
#define __S000 PAGE_NONE
-#define __S001 PAGE_RO
-#define __S010 PAGE_RW
-#define __S011 PAGE_RW
-#define __S100 PAGE_RO
-#define __S101 PAGE_RO
-#define __S110 PAGE_RW
-#define __S111 PAGE_RW
+#define __S001 PAGE_READ
+#define __S010 PAGE_WRITE
+#define __S011 PAGE_WRITE
+#define __S100 PAGE_READ
+#define __S101 PAGE_READ
+#define __S110 PAGE_WRITE
+#define __S111 PAGE_WRITE
/*
* Segment entry (large page) protection definitions.
*/
-#define SEGMENT_NONE __pgprot(_HPAGE_TYPE_NONE)
-#define SEGMENT_RO __pgprot(_HPAGE_TYPE_RO)
-#define SEGMENT_RW __pgprot(_HPAGE_TYPE_RW)
-
-static inline int mm_exclusive(struct mm_struct *mm)
-{
- return likely(mm == current->active_mm &&
- atomic_read(&mm->context.attach_count) <= 1);
-}
+#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \
+ _SEGMENT_ENTRY_NONE)
+#define SEGMENT_READ __pgprot(_SEGMENT_ENTRY_INVALID | \
+ _SEGMENT_ENTRY_PROTECT)
+#define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_INVALID)
static inline int mm_has_pgste(struct mm_struct *mm)
{
{
if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2)
return 0;
- return (pgd_val(pgd) & _REGION_ENTRY_INV) != 0UL;
+ return (pgd_val(pgd) & _REGION_ENTRY_INVALID) != 0UL;
}
static inline int pgd_bad(pgd_t pgd)
* invalid for either table entry.
*/
unsigned long mask =
- ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
+ ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
return (pgd_val(pgd) & mask) != 0;
}
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
return 0;
- return (pud_val(pud) & _REGION_ENTRY_INV) != 0UL;
+ return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
}
static inline int pud_large(pud_t pud)
* invalid for either table entry.
*/
unsigned long mask =
- ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
+ ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
return (pud_val(pud) & mask) != 0;
}
static inline int pmd_present(pmd_t pmd)
{
- unsigned long mask = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO;
- return (pmd_val(pmd) & mask) == _HPAGE_TYPE_NONE ||
- !(pmd_val(pmd) & _SEGMENT_ENTRY_INV);
+ return pmd_val(pmd) != _SEGMENT_ENTRY_INVALID;
}
static inline int pmd_none(pmd_t pmd)
{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_INV) &&
- !(pmd_val(pmd) & _SEGMENT_ENTRY_RO);
+ return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
}
static inline int pmd_large(pmd_t pmd)
{
#ifdef CONFIG_64BIT
- return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE);
+ return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
#else
return 0;
#endif
}
+static inline int pmd_prot_none(pmd_t pmd)
+{
+ return (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) &&
+ (pmd_val(pmd) & _SEGMENT_ENTRY_NONE);
+}
+
static inline int pmd_bad(pmd_t pmd)
{
- unsigned long mask = ~_SEGMENT_ENTRY_ORIGIN & ~_SEGMENT_ENTRY_INV;
- return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY;
+#ifdef CONFIG_64BIT
+ if (pmd_large(pmd))
+ return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
+#endif
+ return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0;
+ if (pmd_prot_none(pmd))
+ return 0;
+ return (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT) == 0;
}
static inline int pmd_young(pmd_t pmd)
{
- return 0;
+ int young = 0;
+#ifdef CONFIG_64BIT
+ if (pmd_prot_none(pmd))
+ young = (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT) != 0;
+ else
+ young = (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
+#endif
+ return young;
}
-static inline int pte_none(pte_t pte)
+static inline int pte_present(pte_t pte)
{
- return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT);
+ /* Bit pattern: (pte & 0x001) == 0x001 */
+ return (pte_val(pte) & _PAGE_PRESENT) != 0;
}
-static inline int pte_present(pte_t pte)
+static inline int pte_none(pte_t pte)
{
- unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT | _PAGE_SWX;
- return (pte_val(pte) & mask) == _PAGE_TYPE_NONE ||
- (!(pte_val(pte) & _PAGE_INVALID) &&
- !(pte_val(pte) & _PAGE_SWT));
+ /* Bit pattern: pte == 0x400 */
+ return pte_val(pte) == _PAGE_INVALID;
}
static inline int pte_file(pte_t pte)
{
- unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT;
- return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
+ /* Bit pattern: (pte & 0x601) == 0x600 */
+ return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT | _PAGE_PRESENT))
+ == (_PAGE_INVALID | _PAGE_PROTECT);
}
static inline int pte_special(pte_t pte)
#endif
}
+static inline pgste_t pgste_get(pte_t *ptep)
+{
+ unsigned long pgste = 0;
+#ifdef CONFIG_PGSTE
+ pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
+#endif
+ return __pgste(pgste);
+}
+
static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- unsigned long address, bits;
- unsigned char skey;
+ unsigned long address, bits, skey;
if (pte_val(*ptep) & _PAGE_INVALID)
return pgste;
address = pte_val(*ptep) & PAGE_MASK;
- skey = page_get_storage_key(address);
+ skey = (unsigned long) page_get_storage_key(address);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
- /* Clear page changed & referenced bit in the storage key */
- if (bits & _PAGE_CHANGED)
+ if (!(pgste_val(pgste) & PGSTE_HC_BIT) && (bits & _PAGE_CHANGED)) {
+ /* Transfer dirty + referenced bit to host bits in pgste */
+ pgste_val(pgste) |= bits << 52;
page_set_storage_key(address, skey ^ bits, 0);
- else if (bits)
+ } else if (!(pgste_val(pgste) & PGSTE_HR_BIT) &&
+ (bits & _PAGE_REFERENCED)) {
+ /* Transfer referenced bit to host bit in pgste */
+ pgste_val(pgste) |= PGSTE_HR_BIT;
page_reset_referenced(address);
+ }
/* Transfer page changed & referenced bit to guest bits in pgste */
pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */
- /* Get host changed & referenced bits from pgste */
- bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52;
- /* Transfer page changed & referenced bit to kvm user bits */
- pgste_val(pgste) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */
- /* Clear relevant host bits in pgste. */
- pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT);
- pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
/* Copy page access key and fetch protection bit to pgste */
- pgste_val(pgste) |=
- (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
- /* Transfer referenced bit to pte */
- pte_val(*ptep) |= (bits & _PAGE_REFERENCED) << 1;
+ pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
#endif
return pgste;
static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- int young;
-
if (pte_val(*ptep) & _PAGE_INVALID)
return pgste;
/* Get referenced bit from storage key */
- young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
- if (young)
- pgste_val(pgste) |= PGSTE_GR_BIT;
- /* Get host referenced bit from pgste */
- if (pgste_val(pgste) & PGSTE_HR_BIT) {
- pgste_val(pgste) &= ~PGSTE_HR_BIT;
- young = 1;
- }
- /* Transfer referenced bit to kvm user bits and pte */
- if (young) {
- pgste_val(pgste) |= PGSTE_UR_BIT;
- pte_val(*ptep) |= _PAGE_SWR;
- }
+ if (page_reset_referenced(pte_val(*ptep) & PAGE_MASK))
+ pgste_val(pgste) |= PGSTE_HR_BIT | PGSTE_GR_BIT;
#endif
return pgste;
}
static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
{
- if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_SWW)) {
+ if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_WRITE)) {
/*
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
*/
- pte_val(entry) |= _PAGE_SWC;
- pte_val(entry) &= ~_PAGE_RO;
+ pte_val(entry) |= _PAGE_DIRTY;
+ pte_val(entry) &= ~_PAGE_PROTECT;
}
*ptep = entry;
}
*/
static inline int pte_write(pte_t pte)
{
- return (pte_val(pte) & _PAGE_SWW) != 0;
+ return (pte_val(pte) & _PAGE_WRITE) != 0;
}
static inline int pte_dirty(pte_t pte)
{
- return (pte_val(pte) & _PAGE_SWC) != 0;
+ return (pte_val(pte) & _PAGE_DIRTY) != 0;
}
static inline int pte_young(pte_t pte)
{
-#ifdef CONFIG_PGSTE
- if (pte_val(pte) & _PAGE_SWR)
- return 1;
-#endif
- return 0;
+ return (pte_val(pte) & _PAGE_YOUNG) != 0;
}
/*
static inline void pmd_clear(pmd_t *pmdp)
{
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ pmd_val(*pmdp) = _SEGMENT_ENTRY_INVALID;
}
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ pte_val(*ptep) = _PAGE_INVALID;
}
/*
{
pte_val(pte) &= _PAGE_CHG_MASK;
pte_val(pte) |= pgprot_val(newprot);
- if ((pte_val(pte) & _PAGE_SWC) && (pte_val(pte) & _PAGE_SWW))
- pte_val(pte) &= ~_PAGE_RO;
+ /*
+ * newprot for PAGE_NONE, PAGE_READ and PAGE_WRITE has the
+ * invalid bit set, clear it again for readable, young pages
+ */
+ if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ))
+ pte_val(pte) &= ~_PAGE_INVALID;
+ /*
+ * newprot for PAGE_READ and PAGE_WRITE has the page protection
+ * bit set, clear it again for writable, dirty pages
+ */
+ if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE))
+ pte_val(pte) &= ~_PAGE_PROTECT;
return pte;
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_SWW;
- /* Do not clobber _PAGE_TYPE_NONE pages! */
- if (!(pte_val(pte) & _PAGE_INVALID))
- pte_val(pte) |= _PAGE_RO;
+ pte_val(pte) &= ~_PAGE_WRITE;
+ pte_val(pte) |= _PAGE_PROTECT;
return pte;
}
static inline pte_t pte_mkwrite(pte_t pte)
{
- pte_val(pte) |= _PAGE_SWW;
- if (pte_val(pte) & _PAGE_SWC)
- pte_val(pte) &= ~_PAGE_RO;
+ pte_val(pte) |= _PAGE_WRITE;
+ if (pte_val(pte) & _PAGE_DIRTY)
+ pte_val(pte) &= ~_PAGE_PROTECT;
return pte;
}
static inline pte_t pte_mkclean(pte_t pte)
{
- pte_val(pte) &= ~_PAGE_SWC;
- /* Do not clobber _PAGE_TYPE_NONE pages! */
- if (!(pte_val(pte) & _PAGE_INVALID))
- pte_val(pte) |= _PAGE_RO;
+ pte_val(pte) &= ~_PAGE_DIRTY;
+ pte_val(pte) |= _PAGE_PROTECT;
return pte;
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- pte_val(pte) |= _PAGE_SWC;
- if (pte_val(pte) & _PAGE_SWW)
- pte_val(pte) &= ~_PAGE_RO;
+ pte_val(pte) |= _PAGE_DIRTY;
+ if (pte_val(pte) & _PAGE_WRITE)
+ pte_val(pte) &= ~_PAGE_PROTECT;
return pte;
}
static inline pte_t pte_mkold(pte_t pte)
{
-#ifdef CONFIG_PGSTE
- pte_val(pte) &= ~_PAGE_SWR;
-#endif
+ pte_val(pte) &= ~_PAGE_YOUNG;
+ pte_val(pte) |= _PAGE_INVALID;
return pte;
}
static inline pte_t pte_mkyoung(pte_t pte)
{
+ pte_val(pte) |= _PAGE_YOUNG;
+ if (pte_val(pte) & _PAGE_READ)
+ pte_val(pte) &= ~_PAGE_INVALID;
return pte;
}
#ifdef CONFIG_HUGETLB_PAGE
static inline pte_t pte_mkhuge(pte_t pte)
{
- pte_val(pte) |= (_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_CO);
+ pte_val(pte) |= _PAGE_LARGE;
return pte;
}
#endif
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste = pgste_update_all(ptep, pgste);
- dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
- pgste_val(pgste) &= ~PGSTE_UC_BIT;
+ dirty = !!(pgste_val(pgste) & PGSTE_HC_BIT);
+ pgste_val(pgste) &= ~PGSTE_HC_BIT;
pgste_set_unlock(ptep, pgste);
return dirty;
}
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste = pgste_update_young(ptep, pgste);
- young = !!(pgste_val(pgste) & PGSTE_UR_BIT);
- pgste_val(pgste) &= ~PGSTE_UR_BIT;
+ young = !!(pgste_val(pgste) & PGSTE_HR_BIT);
+ pgste_val(pgste) &= ~PGSTE_HR_BIT;
pgste_set_unlock(ptep, pgste);
}
return young;
}
+static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
+{
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+#ifndef CONFIG_64BIT
+ /* pto must point to the start of the segment table */
+ pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
+#else
+ /* ipte in zarch mode can do the math */
+ pte_t *pto = ptep;
+#endif
+ asm volatile(
+ " ipte %2,%3"
+ : "=m" (*ptep) : "m" (*ptep),
+ "a" (pto), "a" (address));
+ }
+}
+
+static inline void ptep_flush_lazy(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ int active = (mm == current->active_mm) ? 1 : 0;
+
+ if (atomic_read(&mm->context.attach_count) > active)
+ __ptep_ipte(address, ptep);
+ else
+ mm->context.flush_mm = 1;
+}
+
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pgste_t pgste;
pte_t pte;
+ int young;
if (mm_has_pgste(vma->vm_mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_update_young(ptep, pgste);
- pte = *ptep;
- *ptep = pte_mkold(pte);
- pgste_set_unlock(ptep, pgste);
- return pte_young(pte);
+ pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste);
}
- return 0;
+
+ pte = *ptep;
+ __ptep_ipte(addr, ptep);
+ young = pte_young(pte);
+ pte = pte_mkold(pte);
+
+ if (mm_has_pgste(vma->vm_mm)) {
+ pgste_set_pte(ptep, pte);
+ pgste_set_unlock(ptep, pgste);
+ } else
+ *ptep = pte;
+
+ return young;
}
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- /* No need to flush TLB
- * On s390 reference bits are in storage key and never in TLB
- * With virtualization we handle the reference bit, without we
- * we can simply return */
return ptep_test_and_clear_young(vma, address, ptep);
}
-static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
-{
- if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-#ifndef CONFIG_64BIT
- /* pto must point to the start of the segment table */
- pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
-#else
- /* ipte in zarch mode can do the math */
- pte_t *pto = ptep;
-#endif
- asm volatile(
- " ipte %2,%3"
- : "=m" (*ptep) : "m" (*ptep),
- "a" (pto), "a" (address));
- }
-}
-
/*
* This is hard to understand. ptep_get_and_clear and ptep_clear_flush
* both clear the TLB for the unmapped pte. The reason is that
pgste_t pgste;
pte_t pte;
- mm->context.flush_mm = 1;
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
- if (!mm_exclusive(mm))
- __ptep_ipte(address, ptep);
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ ptep_flush_lazy(mm, address, ptep);
+ pte_val(*ptep) = _PAGE_INVALID;
if (mm_has_pgste(mm)) {
pgste = pgste_update_all(&pte, pgste);
pgste_t pgste;
pte_t pte;
- mm->context.flush_mm = 1;
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
- if (!mm_exclusive(mm))
- __ptep_ipte(address, ptep);
+ ptep_flush_lazy(mm, address, ptep);
+ pte_val(*ptep) |= _PAGE_INVALID;
if (mm_has_pgste(mm)) {
pgste = pgste_update_all(&pte, pgste);
pgste_t pgste;
if (mm_has_pgste(mm)) {
- pgste = *(pgste_t *)(ptep + PTRS_PER_PTE);
+ pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte);
pgste_set_pte(ptep, pte);
pgste_set_unlock(ptep, pgste);
pte = *ptep;
__ptep_ipte(address, ptep);
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ pte_val(*ptep) = _PAGE_INVALID;
if (mm_has_pgste(vma->vm_mm)) {
pgste = pgste_update_all(&pte, pgste);
pgste_t pgste;
pte_t pte;
- if (mm_has_pgste(mm)) {
+ if (!full && mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- if (!full)
- pgste = pgste_ipte_notify(mm, address, ptep, pgste);
+ pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
if (!full)
- __ptep_ipte(address, ptep);
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ ptep_flush_lazy(mm, address, ptep);
+ pte_val(*ptep) = _PAGE_INVALID;
- if (mm_has_pgste(mm)) {
+ if (!full && mm_has_pgste(mm)) {
pgste = pgste_update_all(&pte, pgste);
pgste_set_unlock(ptep, pgste);
}
pte_t pte = *ptep;
if (pte_write(pte)) {
- mm->context.flush_mm = 1;
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
- if (!mm_exclusive(mm))
- __ptep_ipte(address, ptep);
+ ptep_flush_lazy(mm, address, ptep);
pte = pte_wrprotect(pte);
if (mm_has_pgste(mm)) {
{
pte_t __pte;
pte_val(__pte) = physpage + pgprot_val(pgprot);
- return __pte;
+ return pte_mkyoung(__pte);
}
static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
unsigned long physpage = page_to_phys(page);
pte_t __pte = mk_pte_phys(physpage, pgprot);
- if ((pte_val(__pte) & _PAGE_SWW) && PageDirty(page)) {
- pte_val(__pte) |= _PAGE_SWC;
- pte_val(__pte) &= ~_PAGE_RO;
- }
+ if (pte_write(__pte) && PageDirty(page))
+ __pte = pte_mkdirty(__pte);
return __pte;
}
unsigned long sto = (unsigned long) pmdp -
pmd_index(address) * sizeof(pmd_t);
- if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
+ if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) {
asm volatile(
" .insn rrf,0xb98e0000,%2,%3,0,0"
: "=m" (*pmdp)
}
}
+static inline void __pmd_csp(pmd_t *pmdp)
+{
+ register unsigned long reg2 asm("2") = pmd_val(*pmdp);
+ register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
+ _SEGMENT_ENTRY_INVALID;
+ register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
+
+ asm volatile(
+ " csp %1,%3"
+ : "=m" (*pmdp)
+ : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
+}
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
{
/*
- * pgprot is PAGE_NONE, PAGE_RO, or PAGE_RW (see __Pxxx / __Sxxx)
+ * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
* Convert to segment table entry format.
*/
if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
return pgprot_val(SEGMENT_NONE);
- if (pgprot_val(pgprot) == pgprot_val(PAGE_RO))
- return pgprot_val(SEGMENT_RO);
- return pgprot_val(SEGMENT_RW);
+ if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
+ return pgprot_val(SEGMENT_READ);
+ return pgprot_val(SEGMENT_WRITE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+#ifdef CONFIG_64BIT
+ if (pmd_prot_none(pmd)) {
+ pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
+ } else {
+ pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
+ pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID;
+ }
+#endif
+ return pmd;
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+#ifdef CONFIG_64BIT
+ if (pmd_prot_none(pmd)) {
+ pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ } else {
+ pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG;
+ pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
+ }
+#endif
+ return pmd;
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
+ int young;
+
+ young = pmd_young(pmd);
pmd_val(pmd) &= _SEGMENT_CHG_MASK;
pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+ if (young)
+ pmd = pmd_mkyoung(pmd);
return pmd;
}
{
pmd_t __pmd;
pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
- return __pmd;
+ return pmd_mkyoung(__pmd);
}
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
- /* Do not clobber _HPAGE_TYPE_NONE pages! */
- if (!(pmd_val(pmd) & _SEGMENT_ENTRY_INV))
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
+ /* Do not clobber PROT_NONE segments! */
+ if (!pmd_prot_none(pmd))
+ pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
return pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
+ static inline void pmdp_flush_lazy(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
+ {
+ int active = (mm == current->active_mm) ? 1 : 0;
+
+ if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
+ __pmd_idte(address, pmdp);
+ else
+ mm->context.flush_mm = 1;
+ }
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PGTABLE_DEPOSIT
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t entry)
{
- if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1)
+ if (!(pmd_val(entry) & _SEGMENT_ENTRY_INVALID) && MACHINE_HAS_EDAT1)
pmd_val(entry) |= _SEGMENT_ENTRY_CO;
*pmdp = entry;
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- pmd_val(pmd) |= _SEGMENT_ENTRY_RO;
+ /* Do not clobber PROT_NONE segments! */
+ if (!pmd_prot_none(pmd))
+ pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
return pmd;
}
return pmd;
}
-static inline pmd_t pmd_mkold(pmd_t pmd)
-{
- /* No referenced bit in the segment table entry. */
- return pmd;
-}
-
-static inline pmd_t pmd_mkyoung(pmd_t pmd)
-{
- /* No referenced bit in the segment table entry. */
- return pmd;
-}
-
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
- unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK;
- long tmp, rc;
- int counter;
+ pmd_t pmd;
- rc = 0;
- if (MACHINE_HAS_RRBM) {
- counter = PTRS_PER_PTE >> 6;
- asm volatile(
- "0: .insn rre,0xb9ae0000,%0,%3\n" /* rrbm */
- " ogr %1,%0\n"
- " la %3,0(%4,%3)\n"
- " brct %2,0b\n"
- : "=&d" (tmp), "+&d" (rc), "+d" (counter),
- "+a" (pmd_addr)
- : "a" (64 * 4096UL) : "cc");
- rc = !!rc;
- } else {
- counter = PTRS_PER_PTE;
- asm volatile(
- "0: rrbe 0,%2\n"
- " la %2,0(%3,%2)\n"
- " brc 12,1f\n"
- " lhi %0,1\n"
- "1: brct %1,0b\n"
- : "+d" (rc), "+d" (counter), "+a" (pmd_addr)
- : "a" (4096UL) : "cc");
- }
- return rc;
+ pmd = *pmdp;
+ __pmd_idte(address, pmdp);
+ *pmdp = pmd_mkold(pmd);
+ return pmd_young(pmd);
}
#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
* exception will occur instead of a page translation exception. The
* specifiation exception has the bad habit not to store necessary
* information in the lowcore.
- * Bit 21 and bit 22 are the page invalid bit and the page protection
- * bit. We set both to indicate a swapped page.
- * Bit 30 and 31 are used to distinguish the different page types. For
- * a swapped page these bits need to be zero.
+ * Bits 21, 22, 30 and 31 are used to indicate the page type.
+ * A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
* This leaves the bits 1-19 and bits 24-29 to store type and offset.
* We use the 5 bits from 25-29 for the type and the 20 bits from 1-19
* plus 24 for the offset.
* exception will occur instead of a page translation exception. The
* specifiation exception has the bad habit not to store necessary
* information in the lowcore.
- * Bit 53 and bit 54 are the page invalid bit and the page protection
- * bit. We set both to indicate a swapped page.
- * Bit 62 and 63 are used to distinguish the different page types. For
- * a swapped page these bits need to be zero.
+ * Bits 53, 54, 62 and 63 are used to indicate the page type.
+ * A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
* This leaves the bits 0-51 and bits 56-61 to store type and offset.
* We use the 5 bits from 57-61 for the type and the 53 bits from 0-51
* plus 56 for the offset.
{
pte_t pte;
offset &= __SWP_OFFSET_MASK;
- pte_val(pte) = _PAGE_TYPE_SWAP | ((type & 0x1f) << 2) |
+ pte_val(pte) = _PAGE_INVALID | _PAGE_TYPE | ((type & 0x1f) << 2) |
((offset & 1UL) << 7) | ((offset & ~1UL) << 11);
return pte;
}
#define pgoff_to_pte(__off) \
((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \
- | _PAGE_TYPE_FILE })
+ | _PAGE_INVALID | _PAGE_PROTECT })
#endif /* !__ASSEMBLY__ */
#ifndef CONFIG_64BIT
#define TASK_SIZE (1UL << 31)
+ #define TASK_MAX_SIZE (1UL << 31)
#define TASK_UNMAPPED_BASE (1UL << 30)
#else /* CONFIG_64BIT */
#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
(1UL << 30) : (1UL << 41))
#define TASK_SIZE TASK_SIZE_OF(current)
+ #define TASK_MAX_SIZE (1UL << 53)
#endif /* CONFIG_64BIT */
#endif
};
-#define PER_FLAG_NO_TE 1UL /* Flag to disable transactions. */
+/* Flag to disable transactions. */
+#define PER_FLAG_NO_TE 1UL
+/* Flag to enable random transaction aborts. */
+#define PER_FLAG_TE_ABORT_RAND 2UL
+/* Flag to specify random transaction abort mode:
+ * - abort each transaction at a random instruction before TEND if set.
+ * - abort random transactions at a random instruction if cleared.
+ */
+#define PER_FLAG_TE_ABORT_RAND_TEND 4UL
typedef struct thread_struct thread_struct;
#include <asm/pgtable.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>
+ #include <asm/facility.h>
#include <asm/sclp.h>
#include "kvm-s390.h"
#include "gaccess.h"
{ NULL }
};
- static unsigned long long *facilities;
+ unsigned long *vfacilities;
static struct gmap_notifier gmap_notifier;
+ /* test availability of vfacility */
+ static inline int test_vfacility(unsigned long nr)
+ {
+ return __test_facility(nr, (void *) vfacilities);
+ }
+
/* Section: not file related */
int kvm_arch_hardware_enable(void *garbage)
{
vcpu->arch.sie_block->ecb = 6;
vcpu->arch.sie_block->ecb2 = 8;
vcpu->arch.sie_block->eca = 0xC1002001U;
- vcpu->arch.sie_block->fac = (int) (long) facilities;
+ vcpu->arch.sie_block->fac = (int) (long) vfacilities;
hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
(unsigned long) vcpu);
return rc;
vcpu->arch.sie_block->icptcode = 0;
- preempt_disable();
- kvm_guest_enter();
- preempt_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
trace_kvm_s390_sie_enter(vcpu,
atomic_read(&vcpu->arch.sie_block->cpuflags));
+
+ /*
+ * As PF_VCPU will be used in fault handler, between guest_enter
+ * and guest_exit should be no uaccess.
+ */
+ preempt_disable();
+ kvm_guest_enter();
+ preempt_enable();
rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
+ kvm_guest_exit();
+
+ VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+ vcpu->arch.sie_block->icptcode);
+ trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
+
if (rc > 0)
rc = 0;
if (rc < 0) {
rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
}
- VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
- vcpu->arch.sie_block->icptcode);
- trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
- kvm_guest_exit();
memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
return rc;
return 0;
}
+ void kvm_arch_memslots_updated(struct kvm *kvm)
+ {
+ }
+
/* Section: memory related */
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
* to hold the maximum amount of facilities. On the other hand, we
* only set facilities that are known to work in KVM.
*/
- facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
- if (!facilities) {
+ vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
+ if (!vfacilities) {
kvm_exit();
return -ENOMEM;
}
- memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
- facilities[0] &= 0xff82fff3f47c0000ULL;
- facilities[1] &= 0x001c000000000000ULL;
+ memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
+ vfacilities[0] &= 0xff82fff3f47c0000UL;
+ vfacilities[1] &= 0x001c000000000000UL;
return 0;
}
static void __exit kvm_s390_exit(void)
{
- free_page((unsigned long) facilities);
+ free_page((unsigned long) vfacilities);
kvm_exit();
}
#include <linux/errno.h>
#include <linux/compat.h>
#include <asm/asm-offsets.h>
+#include <asm/facility.h>
#include <asm/current.h>
#include <asm/debug.h>
#include <asm/ebcdic.h>
kfree(inti);
no_interrupt:
/* Set condition code and we're done. */
- vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
- vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
+ kvm_s390_set_psw_cc(vcpu, cc);
return 0;
}
* Set condition code 3 to stop the guest from issueing channel
* I/O instructions.
*/
- vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
- vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+ kvm_s390_set_psw_cc(vcpu, 3);
return 0;
}
}
static int handle_stfl(struct kvm_vcpu *vcpu)
{
- unsigned int facility_list;
int rc;
vcpu->stat.instruction_stfl++;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- /* only pass the facility bits, which we can handle */
- facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
-
rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
- &facility_list, sizeof(facility_list));
+ vfacilities, 4);
if (rc)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
- trace_kvm_s390_handle_stfl(vcpu, facility_list);
+ VCPU_EVENT(vcpu, 5, "store facility list value %x",
+ *(unsigned int *) vfacilities);
+ trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
return 0;
}
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
if (fc > 3) {
- vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */
+ kvm_s390_set_psw_cc(vcpu, 3);
return 0;
}
if (fc == 0) {
vcpu->run->s.regs.gprs[0] = 3 << 28;
- vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */
+ kvm_s390_set_psw_cc(vcpu, 0);
return 0;
}
}
trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
free_page(mem);
- vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+ kvm_s390_set_psw_cc(vcpu, 0);
vcpu->run->s.regs.gprs[0] = 0;
return 0;
out_no_data:
- /* condition code 3 */
- vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+ kvm_s390_set_psw_cc(vcpu, 3);
out_exception:
free_page(mem);
return rc;
kvm_s390_get_regs_rre(vcpu, ®1, ®2);
/* This basically extracts the mask half of the psw. */
- vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
+ vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
if (reg2) {
- vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
+ vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
vcpu->run->s.regs.gprs[reg2] |=
- vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
+ vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
}
return 0;
}
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
/* Only provide non-quiescing support if the host supports it */
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ &&
- S390_lowcore.stfl_fac_list & 0x00020000)
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && !test_facility(14))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
/* No support for conditional-SSKE */
struct gmap_rmap *rmap;
struct page *page;
- if (*table & _SEGMENT_ENTRY_INV)
+ if (*table & _SEGMENT_ENTRY_INVALID)
return 0;
page = pfn_to_page(*table >> PAGE_SHIFT);
mp = (struct gmap_pgtable *) page->index;
kfree(rmap);
break;
}
- *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+ *table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
return 1;
}
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
crst_table_init(new, init);
- if (*table & _REGION_ENTRY_INV) {
+ if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
for (off = 0; off < len; off += PMD_SIZE) {
/* Walk the guest addr space page table */
table = gmap->table + (((to + off) >> 53) & 0x7ff);
- if (*table & _REGION_ENTRY_INV)
+ if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + (((to + off) >> 42) & 0x7ff);
- if (*table & _REGION_ENTRY_INV)
+ if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + (((to + off) >> 31) & 0x7ff);
- if (*table & _REGION_ENTRY_INV)
+ if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + (((to + off) >> 20) & 0x7ff);
/* Clear segment table entry in guest address space. */
flush |= gmap_unlink_segment(gmap, table);
- *table = _SEGMENT_ENTRY_INV;
+ *table = _SEGMENT_ENTRY_INVALID;
}
out:
spin_unlock(&gmap->mm->page_table_lock);
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
- if (len == 0 || from + len > PGDIR_SIZE ||
+ if (len == 0 || from + len > TASK_MAX_SIZE ||
from + len < from || to + len < to)
return -EINVAL;
for (off = 0; off < len; off += PMD_SIZE) {
/* Walk the gmap address space page table */
table = gmap->table + (((to + off) >> 53) & 0x7ff);
- if ((*table & _REGION_ENTRY_INV) &&
+ if ((*table & _REGION_ENTRY_INVALID) &&
gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
goto out_unmap;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + (((to + off) >> 42) & 0x7ff);
- if ((*table & _REGION_ENTRY_INV) &&
+ if ((*table & _REGION_ENTRY_INVALID) &&
gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
goto out_unmap;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + (((to + off) >> 31) & 0x7ff);
- if ((*table & _REGION_ENTRY_INV) &&
+ if ((*table & _REGION_ENTRY_INVALID) &&
gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
goto out_unmap;
table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
/* Store 'from' address in an invalid segment table entry. */
flush |= gmap_unlink_segment(gmap, table);
- *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
+ *table = (from + off) | (_SEGMENT_ENTRY_INVALID |
+ _SEGMENT_ENTRY_PROTECT);
}
spin_unlock(&gmap->mm->page_table_lock);
up_read(&gmap->mm->mmap_sem);
unsigned long *table;
table = gmap->table + ((address >> 53) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV))
+ if (unlikely(*table & _REGION_ENTRY_INVALID))
return ERR_PTR(-EFAULT);
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 42) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV))
+ if (unlikely(*table & _REGION_ENTRY_INVALID))
return ERR_PTR(-EFAULT);
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 31) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV))
+ if (unlikely(*table & _REGION_ENTRY_INVALID))
return ERR_PTR(-EFAULT);
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 20) & 0x7ff);
return PTR_ERR(segment_ptr);
/* Convert the gmap address to an mm address. */
segment = *segment_ptr;
- if (!(segment & _SEGMENT_ENTRY_INV)) {
+ if (!(segment & _SEGMENT_ENTRY_INVALID)) {
page = pfn_to_page(segment >> PAGE_SHIFT);
mp = (struct gmap_pgtable *) page->index;
return mp->vmaddr | (address & ~PMD_MASK);
- } else if (segment & _SEGMENT_ENTRY_RO) {
+ } else if (segment & _SEGMENT_ENTRY_PROTECT) {
vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
return vmaddr | (address & ~PMD_MASK);
}
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
mp = (struct gmap_pgtable *) page->index;
list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
- *rmap->entry =
- _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+ *rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
+ _SEGMENT_ENTRY_PROTECT);
list_del(&rmap->list);
kfree(rmap);
flush = 1;
/* Convert the gmap address to an mm address. */
while (1) {
segment = *segment_ptr;
- if (!(segment & _SEGMENT_ENTRY_INV)) {
+ if (!(segment & _SEGMENT_ENTRY_INVALID)) {
/* Page table is present */
page = pfn_to_page(segment >> PAGE_SHIFT);
mp = (struct gmap_pgtable *) page->index;
return mp->vmaddr | (address & ~PMD_MASK);
}
- if (!(segment & _SEGMENT_ENTRY_RO))
+ if (!(segment & _SEGMENT_ENTRY_PROTECT))
/* Nothing mapped in the gmap address space. */
break;
rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
while (address < to) {
/* Walk the gmap address space page table */
table = gmap->table + ((address >> 53) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV)) {
+ if (unlikely(*table & _REGION_ENTRY_INVALID)) {
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 42) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV)) {
+ if (unlikely(*table & _REGION_ENTRY_INVALID)) {
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 31) & 0x7ff);
- if (unlikely(*table & _REGION_ENTRY_INV)) {
+ if (unlikely(*table & _REGION_ENTRY_INVALID)) {
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
table = table + ((address >> 20) & 0x7ff);
- if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
+ if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
continue;
/* Set notification bit in the pgste of the pte */
entry = *ptep;
- if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
+ if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
pgste = pgste_get_lock(ptep);
pgste_val(pgste) |= PGSTE_IN_BIT;
pgste_set_unlock(ptep, pgste);
spin_unlock(&gmap_notifier_lock);
}
+ static inline int page_table_with_pgste(struct page *page)
+ {
+ return atomic_read(&page->_mapcount) == 0;
+ }
+
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
unsigned long vmaddr)
{
mp->vmaddr = vmaddr & PMD_MASK;
INIT_LIST_HEAD(&mp->mapper);
page->index = (unsigned long) mp;
- atomic_set(&page->_mapcount, 3);
+ atomic_set(&page->_mapcount, 0);
table = (unsigned long *) page_to_phys(page);
- clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
- clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
+ PAGE_SIZE/2);
return table;
}
pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
- unsigned long address, bits;
- unsigned char skey;
+ unsigned long address, bits, skey;
address = pte_val(*ptep) & PAGE_MASK;
- skey = page_get_storage_key(address);
+ skey = (unsigned long) page_get_storage_key(address);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+ skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
/* Set storage key ACC and FP */
- page_set_storage_key(address,
- (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)),
- !nq);
-
+ page_set_storage_key(address, skey, !nq);
/* Merge host changed & referenced into pgste */
pgste_val(new) |= bits << 52;
- /* Transfer skey changed & referenced bit to kvm user bits */
- pgste_val(new) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */
}
/* changing the guest storage key is considered a change of the page */
if ((pgste_val(new) ^ pgste_val(old)) &
(PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
- pgste_val(new) |= PGSTE_UC_BIT;
+ pgste_val(new) |= PGSTE_HC_BIT;
pgste_set_unlock(ptep, new);
pte_unmap_unlock(*ptep, ptl);
#else /* CONFIG_PGSTE */
+ static inline int page_table_with_pgste(struct page *page)
+ {
+ return 0;
+ }
+
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
unsigned long vmaddr)
{
pgtable_page_ctor(page);
atomic_set(&page->_mapcount, 1);
table = (unsigned long *) page_to_phys(page);
- clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE);
spin_lock_bh(&mm->context.list_lock);
list_add(&page->lru, &mm->context.pgtable_list);
} else {
struct page *page;
unsigned int bit, mask;
- if (mm_has_pgste(mm)) {
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ if (page_table_with_pgste(page)) {
gmap_disconnect_pgtable(mm, table);
return page_table_free_pgste(table);
}
/* Free 1K/2K page table fragment of a 4K page */
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
spin_lock_bh(&mm->context.list_lock);
if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
unsigned int bit, mask;
mm = tlb->mm;
- if (mm_has_pgste(mm)) {
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ if (page_table_with_pgste(page)) {
gmap_disconnect_pgtable(mm, table);
table = (unsigned long *) (__pa(table) | FRAG_MASK);
tlb_remove_table(tlb, table);
return;
}
bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
spin_lock_bh(&mm->context.list_lock);
if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
list_del(&page->lru);
struct mmu_table_batch **batch = &tlb->batch;
if (*batch) {
- __tlb_flush_mm(tlb->mm);
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
}
{
struct mmu_table_batch **batch = &tlb->batch;
+ tlb->mm->context.flush_mm = 1;
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
- __tlb_flush_mm(tlb->mm);
+ __tlb_flush_mm_lazy(tlb->mm);
tlb_remove_table_one(table);
return;
}
}
(*batch)->tables[(*batch)->nr++] = table;
if ((*batch)->nr == MAX_TABLE_BATCH)
- tlb_table_flush(tlb);
+ tlb_flush_mmu(tlb);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- void thp_split_vma(struct vm_area_struct *vma)
+ static inline void thp_split_vma(struct vm_area_struct *vma)
{
unsigned long addr;
- struct page *page;
- for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
- page = follow_page(vma, addr, FOLL_SPLIT);
- }
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
+ follow_page(vma, addr, FOLL_SPLIT);
}
- void thp_split_mm(struct mm_struct *mm)
+ static inline void thp_split_mm(struct mm_struct *mm)
{
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
- while (vma != NULL) {
+ for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
thp_split_vma(vma);
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
- vma = vma->vm_next;
}
+ mm->def_flags |= VM_NOHUGEPAGE;
+ }
+ #else
+ static inline void thp_split_mm(struct mm_struct *mm)
+ {
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
+ struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end)
+ {
+ unsigned long next, *table, *new;
+ struct page *page;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ again:
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ table = (unsigned long *) pmd_deref(*pmd);
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ if (page_table_with_pgste(page))
+ continue;
+ /* Allocate new page table with pgstes */
+ new = page_table_alloc_pgste(mm, addr);
+ if (!new) {
+ mm->context.has_pgste = 0;
+ continue;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
+ /* Nuke pmd entry pointing to the "short" page table */
+ pmdp_flush_lazy(mm, addr, pmd);
+ pmd_clear(pmd);
+ /* Copy ptes from old table to new table */
+ memcpy(new, table, PAGE_SIZE/2);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ /* Establish new table */
+ pmd_populate(mm, pmd, (pte_t *) new);
+ /* Free old table with rcu, there might be a walker! */
+ page_table_free_rcu(tlb, table);
+ new = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ if (new) {
+ page_table_free_pgste(new);
+ goto again;
+ }
+ } while (pmd++, addr = next, addr != end);
+
+ return addr;
+ }
+
+ static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
+ struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end)
+ {
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+
+ return addr;
+ }
+
+ static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+ {
+ unsigned long next;
+ pgd_t *pgd;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+ }
+
/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
{
struct task_struct *tsk = current;
- struct mm_struct *mm, *old_mm;
+ struct mm_struct *mm = tsk->mm;
+ struct mmu_gather tlb;
/* Do we have switched amode? If no, we cannot do sie */
if (s390_user_mode == HOME_SPACE_MODE)
if (mm_has_pgste(tsk->mm))
return 0;
- /* lets check if we are allowed to replace the mm */
- task_lock(tsk);
- if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
- #ifdef CONFIG_AIO
- !hlist_empty(&tsk->mm->ioctx_list) ||
- #endif
- tsk->mm != tsk->active_mm) {
- task_unlock(tsk);
- return -EINVAL;
- }
- task_unlock(tsk);
-
- /* we copy the mm and let dup_mm create the page tables with_pgstes */
- tsk->mm->context.alloc_pgste = 1;
- /* make sure that both mms have a correct rss state */
- sync_mm_rss(tsk->mm);
- mm = dup_mm(tsk);
- tsk->mm->context.alloc_pgste = 0;
- if (!mm)
- return -ENOMEM;
-
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ down_write(&mm->mmap_sem);
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- mm->def_flags |= VM_NOHUGEPAGE;
- #endif
-
- /* Now lets check again if something happened */
- task_lock(tsk);
- if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
- #ifdef CONFIG_AIO
- !hlist_empty(&tsk->mm->ioctx_list) ||
- #endif
- tsk->mm != tsk->active_mm) {
- mmput(mm);
- task_unlock(tsk);
- return -EINVAL;
- }
-
- /* ok, we are alone. No ptrace, no threads, etc. */
- old_mm = tsk->mm;
- tsk->mm = tsk->active_mm = mm;
- preempt_disable();
- update_mm(mm, tsk);
- atomic_inc(&mm->context.attach_count);
- atomic_dec(&old_mm->context.attach_count);
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
- preempt_enable();
- task_unlock(tsk);
- mmput(old_mm);
- return 0;
+ /* Reallocate the page tables with pgstes */
+ mm->context.has_pgste = 1;
- tlb_gather_mmu(&tlb, mm, 0);
++ tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
+ page_table_realloc(&tlb, mm, 0, TASK_SIZE);
- tlb_finish_mmu(&tlb, 0, -1);
++ tlb_finish_mmu(&tlb, 0, TASK_SIZE);
+ up_write(&mm->mmap_sem);
+ return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
list_del(lh);
}
ptep = (pte_t *) pgtable;
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ pte_val(*ptep) = _PAGE_INVALID;
ptep++;
- pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ pte_val(*ptep) = _PAGE_INVALID;
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
extern void calc_global_load(unsigned long ticks);
extern void update_cpu_load_nohz(void);
- /* Notifier for when a task gets migrated to a new CPU */
- struct task_migration_notifier {
- struct task_struct *task;
- int from_cpu;
- int to_cpu;
- };
- extern void register_task_migration_notifier(struct notifier_block *n);
-
extern unsigned long get_parent_ip(unsigned long addr);
extern void dump_cpu_task(int cpu);
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
+ struct task_struct *last_wakee;
+ unsigned long wakee_flips;
+ unsigned long wakee_flip_decay_ts;
#endif
int on_rq;
* Test if a process is not yet dead (at most zombie state)
* If pid_alive fails, then pointers within the task structure
* can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
*/
static inline int pid_alive(struct task_struct *p)
{
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
+ *
+ * Return: 1 if the task structure is init. 0 otherwise.
*/
static inline int is_global_init(struct task_struct *tsk)
{
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
+#define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */
/*
* Only the _current_ task can read/write to tsk->flags, but other
/**
* is_idle_task - is the specified task an idle task?
* @p: the task in question.
+ *
+ * Return: 1 if @p is an idle task. 0 otherwise.
*/
static inline bool is_idle_task(const struct task_struct *p)
{
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
rq->skip_clock_update = 1;
}
- static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
- void register_task_migration_notifier(struct notifier_block *n)
- {
- atomic_notifier_chain_register(&task_migration_notifier, n);
- }
-
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
- struct task_migration_notifier tmn;
-
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
- tmn.task = p;
- tmn.from_cpu = task_cpu(p);
- tmn.to_cpu = new_cpu;
-
- atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
{
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
*/
asmlinkage void __sched notrace preempt_schedule(void)
{
- struct thread_info *ti = current_thread_info();
-
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (likely(ti->preempt_count || irqs_disabled()))
+ if (likely(!preemptible()))
return;
do {
if (unlikely(!q))
return;
- if (unlikely(!nr_exclusive))
+ if (unlikely(nr_exclusive != 1))
wake_flags = 0;
spin_lock_irqsave(&q->lock, flags);
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. The caller is accounted as waiting for IO.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
- * Returns: 0 if a decrement cannot be done without blocking
+ * Return: 0 if a decrement cannot be done without blocking
* 1 if a decrement succeeded.
*
* If a completion is being used as a counting completion,
* completion_done - Test to see if a completion has any waiters
* @x: completion structure
*
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
*/
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
*/
int task_nice(const struct task_struct *p)
{
/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns:
+ * Return:
* true (>0) if we indeed boosted the target task.
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
debug_show_all_locks();
}
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
{
idle->sched_class = &idle_sched_class;
}
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int __cpuinit
+static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
* happens before everything else. This has to be lower priority than
* the notifier in the perf_event subsystem, though.
*/
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
.notifier_call = migration_call,
.priority = CPU_PRI_MIGRATION,
};
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
}
}
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES);
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
* two cpus are in the same cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
int id = cpu;
+ int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
}
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
;
}
+ n = ndoms_cur;
if (doms_new == NULL) {
- ndoms_cur = 0;
+ n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur && !new_topology; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
#ifdef CONFIG_CGROUP_SCHED
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- struct task_group, css);
+ return css ? container_of(css, struct task_group, css) : NULL;
}
-static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct task_group *tg, *parent;
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- if (!cgrp->parent) {
+ if (!parent) {
/* This is early initialization for the top cgroup */
return &root_task_group.css;
}
- parent = cgroup_tg(cgrp->parent);
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
- struct task_group *parent;
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css_parent(css));
- if (!cgrp->parent)
- return 0;
-
- parent = cgroup_tg(cgrp->parent);
- sched_online_group(tg, parent);
+ if (parent)
+ sched_online_group(tg, parent);
return 0;
}
-static void cpu_cgroup_css_free(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_destroy_group(tg);
}
-static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_offline_group(tg);
}
-static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
return 0;
}
-static void cpu_cgroup_attach(struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}
-static void
-cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
- struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
return (u64) scale_load_down(tg->shares);
}
return cfs_period_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_quota(cgroup_tg(cgrp));
+ return tg_get_cfs_quota(css_tg(css));
}
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
{
- return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_period(cgroup_tg(cgrp));
+ return tg_get_cfs_period(css_tg(css));
}
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
{
- return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
struct cfs_schedulable_data {
return ret;
}
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
{
- return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ return sched_group_set_rt_runtime(css_tg(css), val);
}
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_runtime(cgroup_tg(cgrp));
+ return sched_group_rt_runtime(css_tg(css));
}
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
{
- return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_period(cgroup_tg(cgrp));
+ return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */