Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <[email protected]>

Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)

committer Linus Torvalds <[email protected]>

Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
author Linus Torvalds <[email protected]>
Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
committer Linus Torvalds <[email protected]>
Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
diff --combined arch/arm/configs/keystone_defconfig

index 62e968cac9dc132f66c7c2d397c689a19aeda295,62e968cac9dc132f66c7c2d397c689a19aeda295..1f36b823905f1d6e1c16eeb9e0554efc1678bcf0
--- 1/arch/arm/configs/keystone_defconfig
--- 2/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@@ -104,6 -104,6 +104,7 @@@ CONFIG_IP_SCTP=
   CONFIG_VLAN_8021Q=y
   CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
   CONFIG_CMA=y
++CONFIG_DMA_CMA=y
   CONFIG_MTD=y
   CONFIG_MTD_CMDLINE_PARTS=y
   CONFIG_MTD_BLOCK=y
diff --combined arch/arm/configs/omap2plus_defconfig

index 5339e6a4d639dccaca9144eb0df3101b692d0df2,5339e6a4d639dccaca9144eb0df3101b692d0df2..5465f564fdf3c8e5a9af125182f6fdfd983e5ab1
--- 1/arch/arm/configs/omap2plus_defconfig
--- 2/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@@ -78,6 -78,6 +78,7 @@@ CONFIG_MAC80211_RC_PID=
   CONFIG_MAC80211_RC_DEFAULT_PID=y
   CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
   CONFIG_CMA=y
++CONFIG_DMA_CMA=y
   CONFIG_CONNECTOR=y
   CONFIG_DEVTMPFS=y
   CONFIG_DEVTMPFS_MOUNT=y
diff --combined arch/arm/configs/tegra_defconfig

index 1effb43dab80833f2b0df2a520fc61f7f2db6028,1effb43dab80833f2b0df2a520fc61f7f2db6028..92d0a149aeb5e58094ad3d3765fec5e878065d4a
--- 1/arch/arm/configs/tegra_defconfig
--- 2/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@@ -79,6 -79,6 +79,7 @@@ CONFIG_DEVTMPFS=
   CONFIG_DEVTMPFS_MOUNT=y
   # CONFIG_FIRMWARE_IN_KERNEL is not set
   CONFIG_CMA=y
++CONFIG_DMA_CMA=y
   CONFIG_MTD=y
   CONFIG_MTD_M25P80=y
   CONFIG_PROC_DEVICETREE=y
diff --combined arch/powerpc/kernel/asm-offsets.c

index 8207459efe5619e4b9e2d824221104d111507e67,26098c20936dc84cdd611e1382d5f910d8ee9675..d8958be5f31a18b7c0bae16509749067aae75265
--- 1/arch/powerpc/kernel/asm-offsets.c
--- 2/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@@ -138,9 -138,6 +138,9 @@@ int main(void
         DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
         DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
         DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
+ +      DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar));
+ +      DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr));
+ +      DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr));
         DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
         DEFINE(THREAD_TRANSACT_VR0, offsetof(struct thread_struct,
                                          transact_vr[0]));
@@@ -454,6 -451,7 +454,7 @@@
         DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
         DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
   #endif
+       DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
         DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
         DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
         DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
diff --combined arch/s390/include/asm/mmu_context.h

index 7b7fce4e846941832282adb57e760e49c506ac0a,4fb67a0e4ddf263c5b156759247f98249e913536..9f973d8de90ea91fbde12f11cff70c8470bf8a81
--- 1/arch/s390/include/asm/mmu_context.h
--- 2/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@@ -21,24 -21,7 +21,7 @@@ static inline int init_new_context(stru
   #ifdef CONFIG_64BIT
         mm->context.asce_bits |= _ASCE_TYPE_REGION3;
   #endif
-       if (current->mm && current->mm->context.alloc_pgste) {
-               /*
-                * alloc_pgste indicates, that any NEW context will be created
-                * with extended page tables. The old context is unchanged. The
-                * page table allocation and the page table operations will
-                * look at has_pgste to distinguish normal and extended page
-                * tables. The only way to create extended page tables is to
-                * set alloc_pgste and then create a new context (e.g. dup_mm).
-                * The page table allocation is called after init_new_context
-                * and if has_pgste is set, it will create extended page
-                * tables.
-                */
-               mm->context.has_pgste = 1;
-               mm->context.alloc_pgste = 1;
-       } else {
-               mm->context.has_pgste = 0;
-               mm->context.alloc_pgste = 0;
-       }
+       mm->context.has_pgste = 0;
         mm->context.asce_limit = STACK_TOP_MAX;
         crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
         return 0;
@@@ -77,7 -60,8 +60,7 @@@ static inline void switch_mm(struct mm_
         WARN_ON(atomic_read(&prev->context.attach_count) < 0);
         atomic_inc(&next->context.attach_count);
         /* Check for TLBs not flushed yet */
- -      if (next->context.flush_mm)
- -              __tlb_flush_mm(next);
+ +      __tlb_flush_mm_lazy(next);
   }
   
   #define enter_lazy_tlb(mm,tsk)        do { } while (0)
diff --combined arch/s390/include/asm/pgtable.h

index 9f215b40109e1c4d9df5bb0aa6da36e5be3213ba,7a60bb93e83c23ac1f3216b255abff6534d91cfa..9b60a36c348d5422dc325463bcb26efeee64161d
--- 1/arch/s390/include/asm/pgtable.h
--- 2/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@@ -217,57 -217,63 +217,57 @@@ extern unsigned long MODULES_END
   
   /* Hardware bits in the page table entry */
   #define _PAGE_CO      0x100           /* HW Change-bit override */
- -#define _PAGE_RO      0x200           /* HW read-only bit  */
+ +#define _PAGE_PROTECT 0x200           /* HW read-only bit  */
   #define _PAGE_INVALID 0x400           /* HW invalid bit    */
+ +#define _PAGE_LARGE   0x800           /* Bit to mark a large pte */
   
   /* Software bits in the page table entry */
- -#define _PAGE_SWT     0x001           /* SW pte type bit t */
- -#define _PAGE_SWX     0x002           /* SW pte type bit x */
- -#define _PAGE_SWC     0x004           /* SW pte changed bit */
- -#define _PAGE_SWR     0x008           /* SW pte referenced bit */
- -#define _PAGE_SWW     0x010           /* SW pte write bit */
- -#define _PAGE_SPECIAL 0x020           /* SW associated with special page */
+ +#define _PAGE_PRESENT 0x001           /* SW pte present bit */
+ +#define _PAGE_TYPE    0x002           /* SW pte type bit */
+ +#define _PAGE_YOUNG   0x004           /* SW pte young bit */
+ +#define _PAGE_DIRTY   0x008           /* SW pte dirty bit */
+ +#define _PAGE_READ    0x010           /* SW pte read bit */
+ +#define _PAGE_WRITE   0x020           /* SW pte write bit */
+ +#define _PAGE_SPECIAL 0x040           /* SW associated with special page */
   #define __HAVE_ARCH_PTE_SPECIAL
   
   /* Set of bits not changed in pte_modify */
   #define _PAGE_CHG_MASK                (PAGE_MASK | _PAGE_SPECIAL | _PAGE_CO | \
- -                               _PAGE_SWC | _PAGE_SWR)
- -
- -/* Six different types of pages. */
- -#define _PAGE_TYPE_EMPTY      0x400
- -#define _PAGE_TYPE_NONE               0x401
- -#define _PAGE_TYPE_SWAP               0x403
- -#define _PAGE_TYPE_FILE               0x601   /* bit 0x002 is used for offset !! */
- -#define _PAGE_TYPE_RO         0x200
- -#define _PAGE_TYPE_RW         0x000
- -
- -/*
- - * Only four types for huge pages, using the invalid bit and protection bit
- - * of a segment table entry.
- - */
- -#define _HPAGE_TYPE_EMPTY     0x020   /* _SEGMENT_ENTRY_INV */
- -#define _HPAGE_TYPE_NONE      0x220
- -#define _HPAGE_TYPE_RO                0x200   /* _SEGMENT_ENTRY_RO  */
- -#define _HPAGE_TYPE_RW                0x000
+ +                               _PAGE_DIRTY | _PAGE_YOUNG)
   
   /*
- - * PTE type bits are rather complicated. handle_pte_fault uses pte_present,
- - * pte_none and pte_file to find out the pte type WITHOUT holding the page
- - * table lock. ptep_clear_flush on the other hand uses ptep_clear_flush to
- - * invalidate a given pte. ipte sets the hw invalid bit and clears all tlbs
- - * for the page. The page table entry is set to _PAGE_TYPE_EMPTY afterwards.
- - * This change is done while holding the lock, but the intermediate step
- - * of a previously valid pte with the hw invalid bit set can be observed by
- - * handle_pte_fault. That makes it necessary that all valid pte types with
- - * the hw invalid bit set must be distinguishable from the four pte types
- - * empty, none, swap and file.
+ + * handle_pte_fault uses pte_present, pte_none and pte_file to find out the
+ + * pte type WITHOUT holding the page table lock. The _PAGE_PRESENT bit
+ + * is used to distinguish present from not-present ptes. It is changed only
+ + * with the page table lock held.
+ + *
+ + * The following table gives the different possible bit combinations for
+ + * the pte hardware and software bits in the last 12 bits of a pte:
    *
- - *                    irxt  ipte  irxt
- - * _PAGE_TYPE_EMPTY   1000   ->   1000
- - * _PAGE_TYPE_NONE    1001   ->   1001
- - * _PAGE_TYPE_SWAP    1011   ->   1011
- - * _PAGE_TYPE_FILE    11?1   ->   11?1
- - * _PAGE_TYPE_RO      0100   ->   1100
- - * _PAGE_TYPE_RW      0000   ->   1000
+ + *                            842100000000
+ + *                            000084210000
+ + *                            000000008421
+ + *                            .IR...wrdytp
+ + * empty                      .10...000000
+ + * swap                       .10...xxxx10
+ + * file                       .11...xxxxx0
+ + * prot-none, clean, old      .11...000001
+ + * prot-none, clean, young    .11...000101
+ + * prot-none, dirty, old      .10...001001
+ + * prot-none, dirty, young    .10...001101
+ + * read-only, clean, old      .11...010001
+ + * read-only, clean, young    .01...010101
+ + * read-only, dirty, old      .11...011001
+ + * read-only, dirty, young    .01...011101
+ + * read-write, clean, old     .11...110001
+ + * read-write, clean, young   .01...110101
+ + * read-write, dirty, old     .10...111001
+ + * read-write, dirty, young   .00...111101
    *
- - * pte_none is true for bits combinations 1000, 1010, 1100, 1110
- - * pte_present is true for bits combinations 0000, 0010, 0100, 0110, 1001
- - * pte_file is true for bits combinations 1101, 1111
- - * swap pte is 1011 and 0001, 0011, 0101, 0111 are invalid.
+ + * pte_present is true for the bit pattern .xx...xxxxx1, (pte & 0x001) == 0x001
+ + * pte_none    is true for the bit pattern .10...000000, pte == 0x400
+ + * pte_file    is true for the bit pattern .11...xxxxx0, (pte & 0x601) == 0x600
+ + * pte_swap    is true for the bit pattern .10...xxxx10, (pte & 0x603) == 0x402
    */
   
   #ifndef CONFIG_64BIT
@@@ -280,25 -286,14 +280,25 @@@
   #define _ASCE_TABLE_LENGTH    0x7f    /* 128 x 64 entries = 8k            */
   
   /* Bits in the segment table entry */
+ +#define _SEGMENT_ENTRY_BITS   0x7fffffffUL    /* Valid segment table bits */
   #define _SEGMENT_ENTRY_ORIGIN 0x7fffffc0UL    /* page table origin        */
- -#define _SEGMENT_ENTRY_RO     0x200   /* page protection bit              */
- -#define _SEGMENT_ENTRY_INV    0x20    /* invalid segment table entry      */
+ +#define _SEGMENT_ENTRY_PROTECT        0x200   /* page protection bit              */
+ +#define _SEGMENT_ENTRY_INVALID        0x20    /* invalid segment table entry      */
   #define _SEGMENT_ENTRY_COMMON 0x10    /* common segment bit               */
   #define _SEGMENT_ENTRY_PTL    0x0f    /* page table length                */
+ +#define _SEGMENT_ENTRY_NONE   _SEGMENT_ENTRY_PROTECT
   
   #define _SEGMENT_ENTRY                (_SEGMENT_ENTRY_PTL)
- -#define _SEGMENT_ENTRY_EMPTY  (_SEGMENT_ENTRY_INV)
+ +#define _SEGMENT_ENTRY_EMPTY  (_SEGMENT_ENTRY_INVALID)
+ +
+ +/*
+ + * Segment table entry encoding (I = invalid, R = read-only bit):
+ + *            ..R...I.....
+ + * prot-none  ..1...1.....
+ + * read-only  ..1...0.....
+ + * read-write ..0...0.....
+ + * empty      ..0...1.....
+ + */
   
   /* Page status table bits for virtualization */
   #define PGSTE_ACC_BITS        0xf0000000UL
@@@ -308,7 -303,9 +308,7 @@@
   #define PGSTE_HC_BIT  0x00200000UL
   #define PGSTE_GR_BIT  0x00040000UL
   #define PGSTE_GC_BIT  0x00020000UL
- -#define PGSTE_UR_BIT  0x00008000UL
- -#define PGSTE_UC_BIT  0x00004000UL    /* user dirty (migration) */
- -#define PGSTE_IN_BIT  0x00002000UL    /* IPTE notify bit */
+ +#define PGSTE_IN_BIT  0x00008000UL    /* IPTE notify bit */
   
   #else /* CONFIG_64BIT */
   
@@@ -327,8 -324,8 +327,8 @@@
   
   /* Bits in the region table entry */
   #define _REGION_ENTRY_ORIGIN  ~0xfffUL/* region/segment table origin      */
- -#define _REGION_ENTRY_RO      0x200   /* region protection bit            */
- -#define _REGION_ENTRY_INV     0x20    /* invalid region table entry       */
+ +#define _REGION_ENTRY_PROTECT 0x200   /* region protection bit            */
+ +#define _REGION_ENTRY_INVALID 0x20    /* invalid region table entry       */
   #define _REGION_ENTRY_TYPE_MASK       0x0c    /* region/segment table type mask   */
   #define _REGION_ENTRY_TYPE_R1 0x0c    /* region first table type          */
   #define _REGION_ENTRY_TYPE_R2 0x08    /* region second table type         */
@@@ -336,47 -333,29 +336,47 @@@
   #define _REGION_ENTRY_LENGTH  0x03    /* region third length              */
   
   #define _REGION1_ENTRY                (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_LENGTH)
- -#define _REGION1_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INV)
+ +#define _REGION1_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID)
   #define _REGION2_ENTRY                (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH)
- -#define _REGION2_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INV)
+ +#define _REGION2_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID)
   #define _REGION3_ENTRY                (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
- -#define _REGION3_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INV)
+ +#define _REGION3_ENTRY_EMPTY  (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
   
   #define _REGION3_ENTRY_LARGE  0x400   /* RTTE-format control, large page  */
   #define _REGION3_ENTRY_RO     0x200   /* page protection bit              */
   #define _REGION3_ENTRY_CO     0x100   /* change-recording override        */
   
   /* Bits in the segment table entry */
+ +#define _SEGMENT_ENTRY_BITS   0xfffffffffffffe33UL
+ +#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff1ff33UL
   #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address      */
   #define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin             */
- -#define _SEGMENT_ENTRY_RO     0x200   /* page protection bit              */
- -#define _SEGMENT_ENTRY_INV    0x20    /* invalid segment table entry      */
+ +#define _SEGMENT_ENTRY_PROTECT        0x200   /* page protection bit              */
+ +#define _SEGMENT_ENTRY_INVALID        0x20    /* invalid segment table entry      */
   
   #define _SEGMENT_ENTRY                (0)
- -#define _SEGMENT_ENTRY_EMPTY  (_SEGMENT_ENTRY_INV)
+ +#define _SEGMENT_ENTRY_EMPTY  (_SEGMENT_ENTRY_INVALID)
   
   #define _SEGMENT_ENTRY_LARGE  0x400   /* STE-format control, large page   */
   #define _SEGMENT_ENTRY_CO     0x100   /* change-recording override   */
+ +#define _SEGMENT_ENTRY_SPLIT  0x001   /* THP splitting bit */
+ +#define _SEGMENT_ENTRY_YOUNG  0x002   /* SW segment young bit */
+ +#define _SEGMENT_ENTRY_NONE   _SEGMENT_ENTRY_YOUNG
+ +
+ +/*
+ + * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
+ + *                    ..R...I...y.
+ + * prot-none, old     ..0...1...1.
+ + * prot-none, young   ..1...1...1.
+ + * read-only, old     ..1...1...0.
+ + * read-only, young   ..1...0...1.
+ + * read-write, old    ..0...1...0.
+ + * read-write, young  ..0...0...1.
+ + * The segment table origin is used to distinguish empty (origin==0) from
+ + * read-write, old segment table entries (origin!=0)
+ + */
+ +
   #define _SEGMENT_ENTRY_SPLIT_BIT 0    /* THP splitting bit number */
- -#define _SEGMENT_ENTRY_SPLIT  (1UL << _SEGMENT_ENTRY_SPLIT_BIT)
   
   /* Set of bits not changed in pmd_modify */
   #define _SEGMENT_CHG_MASK     (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
@@@ -390,7 -369,9 +390,7 @@@
   #define PGSTE_HC_BIT  0x0020000000000000UL
   #define PGSTE_GR_BIT  0x0004000000000000UL
   #define PGSTE_GC_BIT  0x0002000000000000UL
- -#define PGSTE_UR_BIT  0x0000800000000000UL
- -#define PGSTE_UC_BIT  0x0000400000000000UL    /* user dirty (migration) */
- -#define PGSTE_IN_BIT  0x0000200000000000UL    /* IPTE notify bit */
+ +#define PGSTE_IN_BIT  0x0000800000000000UL    /* IPTE notify bit */
   
   #endif /* CONFIG_64BIT */
   
@@@ -405,18 -386,14 +405,18 @@@
   /*
    * Page protection definitions.
    */
- -#define PAGE_NONE     __pgprot(_PAGE_TYPE_NONE)
- -#define PAGE_RO               __pgprot(_PAGE_TYPE_RO)
- -#define PAGE_RW               __pgprot(_PAGE_TYPE_RO | _PAGE_SWW)
- -#define PAGE_RWC      __pgprot(_PAGE_TYPE_RW | _PAGE_SWW | _PAGE_SWC)
- -
- -#define PAGE_KERNEL   PAGE_RWC
- -#define PAGE_SHARED   PAGE_KERNEL
- -#define PAGE_COPY     PAGE_RO
+ +#define PAGE_NONE     __pgprot(_PAGE_PRESENT | _PAGE_INVALID)
+ +#define PAGE_READ     __pgprot(_PAGE_PRESENT | _PAGE_READ | \
+ +                               _PAGE_INVALID | _PAGE_PROTECT)
+ +#define PAGE_WRITE    __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ +                               _PAGE_INVALID | _PAGE_PROTECT)
+ +
+ +#define PAGE_SHARED   __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ +                               _PAGE_YOUNG | _PAGE_DIRTY)
+ +#define PAGE_KERNEL   __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
+ +                               _PAGE_YOUNG | _PAGE_DIRTY)
+ +#define PAGE_KERNEL_RO        __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_YOUNG | \
+ +                               _PAGE_PROTECT)
   
   /*
    * On s390 the page table entry has an invalid bit and a read-only bit.
@@@ -425,31 -402,35 +425,31 @@@
    */
            /*xwr*/
   #define __P000        PAGE_NONE
- -#define __P001        PAGE_RO
- -#define __P010        PAGE_RO
- -#define __P011        PAGE_RO
- -#define __P100        PAGE_RO
- -#define __P101        PAGE_RO
- -#define __P110        PAGE_RO
- -#define __P111        PAGE_RO
+ +#define __P001        PAGE_READ
+ +#define __P010        PAGE_READ
+ +#define __P011        PAGE_READ
+ +#define __P100        PAGE_READ
+ +#define __P101        PAGE_READ
+ +#define __P110        PAGE_READ
+ +#define __P111        PAGE_READ
   
   #define __S000        PAGE_NONE
- -#define __S001        PAGE_RO
- -#define __S010        PAGE_RW
- -#define __S011        PAGE_RW
- -#define __S100        PAGE_RO
- -#define __S101        PAGE_RO
- -#define __S110        PAGE_RW
- -#define __S111        PAGE_RW
+ +#define __S001        PAGE_READ
+ +#define __S010        PAGE_WRITE
+ +#define __S011        PAGE_WRITE
+ +#define __S100        PAGE_READ
+ +#define __S101        PAGE_READ
+ +#define __S110        PAGE_WRITE
+ +#define __S111        PAGE_WRITE
   
   /*
    * Segment entry (large page) protection definitions.
    */
- -#define SEGMENT_NONE  __pgprot(_HPAGE_TYPE_NONE)
- -#define SEGMENT_RO    __pgprot(_HPAGE_TYPE_RO)
- -#define SEGMENT_RW    __pgprot(_HPAGE_TYPE_RW)
- -
- -static inline int mm_exclusive(struct mm_struct *mm)
- -{
- -      return likely(mm == current->active_mm &&
- -                    atomic_read(&mm->context.attach_count) <= 1);
- -}
+ +#define SEGMENT_NONE  __pgprot(_SEGMENT_ENTRY_INVALID | \
+ +                               _SEGMENT_ENTRY_NONE)
+ +#define SEGMENT_READ  __pgprot(_SEGMENT_ENTRY_INVALID | \
+ +                               _SEGMENT_ENTRY_PROTECT)
+ +#define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_INVALID)
   
   static inline int mm_has_pgste(struct mm_struct *mm)
   {
@@@ -486,7 -467,7 +486,7 @@@ static inline int pgd_none(pgd_t pgd
   {
         if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2)
                 return 0;
- -      return (pgd_val(pgd) & _REGION_ENTRY_INV) != 0UL;
+ +      return (pgd_val(pgd) & _REGION_ENTRY_INVALID) != 0UL;
   }
   
   static inline int pgd_bad(pgd_t pgd)
@@@ -497,7 -478,7 +497,7 @@@
          * invalid for either table entry.
          */
         unsigned long mask =
- -              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
+ +              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
                 ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
         return (pgd_val(pgd) & mask) != 0;
   }
@@@ -513,7 -494,7 +513,7 @@@ static inline int pud_none(pud_t pud
   {
         if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
                 return 0;
- -      return (pud_val(pud) & _REGION_ENTRY_INV) != 0UL;
+ +      return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
   }
   
   static inline int pud_large(pud_t pud)
@@@ -531,7 -512,7 +531,7 @@@ static inline int pud_bad(pud_t pud
          * invalid for either table entry.
          */
         unsigned long mask =
- -              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
+ +              ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
                 ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
         return (pud_val(pud) & mask) != 0;
   }
@@@ -540,36 -521,30 +540,36 @@@
   
   static inline int pmd_present(pmd_t pmd)
   {
- -      unsigned long mask = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO;
- -      return (pmd_val(pmd) & mask) == _HPAGE_TYPE_NONE ||
- -             !(pmd_val(pmd) & _SEGMENT_ENTRY_INV);
+ +      return pmd_val(pmd) != _SEGMENT_ENTRY_INVALID;
   }
   
   static inline int pmd_none(pmd_t pmd)
   {
- -      return (pmd_val(pmd) & _SEGMENT_ENTRY_INV) &&
- -             !(pmd_val(pmd) & _SEGMENT_ENTRY_RO);
+ +      return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
   }
   
   static inline int pmd_large(pmd_t pmd)
   {
   #ifdef CONFIG_64BIT
- -      return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE);
+ +      return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
   #else
         return 0;
   #endif
   }
   
+ +static inline int pmd_prot_none(pmd_t pmd)
+ +{
+ +      return (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) &&
+ +              (pmd_val(pmd) & _SEGMENT_ENTRY_NONE);
+ +}
+ +
   static inline int pmd_bad(pmd_t pmd)
   {
- -      unsigned long mask = ~_SEGMENT_ENTRY_ORIGIN & ~_SEGMENT_ENTRY_INV;
- -      return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY;
+ +#ifdef CONFIG_64BIT
+ +      if (pmd_large(pmd))
+ +              return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
+ +#endif
+ +      return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
   }
   
   #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
@@@ -588,40 -563,31 +588,40 @@@ extern int pmdp_clear_flush_young(struc
   #define __HAVE_ARCH_PMD_WRITE
   static inline int pmd_write(pmd_t pmd)
   {
- -      return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0;
+ +      if (pmd_prot_none(pmd))
+ +              return 0;
+ +      return (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT) == 0;
   }
   
   static inline int pmd_young(pmd_t pmd)
   {
- -      return 0;
+ +      int young = 0;
+ +#ifdef CONFIG_64BIT
+ +      if (pmd_prot_none(pmd))
+ +              young = (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT) != 0;
+ +      else
+ +              young = (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
+ +#endif
+ +      return young;
   }
   
- -static inline int pte_none(pte_t pte)
+ +static inline int pte_present(pte_t pte)
   {
- -      return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT);
+ +      /* Bit pattern: (pte & 0x001) == 0x001 */
+ +      return (pte_val(pte) & _PAGE_PRESENT) != 0;
   }
   
- -static inline int pte_present(pte_t pte)
+ +static inline int pte_none(pte_t pte)
   {
- -      unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT | _PAGE_SWX;
- -      return (pte_val(pte) & mask) == _PAGE_TYPE_NONE ||
- -              (!(pte_val(pte) & _PAGE_INVALID) &&
- -               !(pte_val(pte) & _PAGE_SWT));
+ +      /* Bit pattern: pte == 0x400 */
+ +      return pte_val(pte) == _PAGE_INVALID;
   }
   
   static inline int pte_file(pte_t pte)
   {
- -      unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT;
- -      return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
+ +      /* Bit pattern: (pte & 0x601) == 0x600 */
+ +      return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT | _PAGE_PRESENT))
+ +              == (_PAGE_INVALID | _PAGE_PROTECT);
   }
   
   static inline int pte_special(pte_t pte)
@@@ -668,15 -634,6 +668,15 @@@ static inline void pgste_set_unlock(pte
   #endif
   }
   
+ +static inline pgste_t pgste_get(pte_t *ptep)
+ +{
+ +      unsigned long pgste = 0;
+ +#ifdef CONFIG_PGSTE
+ +      pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
+ +#endif
+ +      return __pgste(pgste);
+ +}
+ +
   static inline void pgste_set(pte_t *ptep, pgste_t pgste)
   {
   #ifdef CONFIG_PGSTE
@@@ -687,28 -644,33 +687,28 @@@
   static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
   {
   #ifdef CONFIG_PGSTE
- -      unsigned long address, bits;
- -      unsigned char skey;
+ +      unsigned long address, bits, skey;
   
         if (pte_val(*ptep) & _PAGE_INVALID)
                 return pgste;
         address = pte_val(*ptep) & PAGE_MASK;
- -      skey = page_get_storage_key(address);
+ +      skey = (unsigned long) page_get_storage_key(address);
         bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
- -      /* Clear page changed & referenced bit in the storage key */
- -      if (bits & _PAGE_CHANGED)
+ +      if (!(pgste_val(pgste) & PGSTE_HC_BIT) && (bits & _PAGE_CHANGED)) {
+ +              /* Transfer dirty + referenced bit to host bits in pgste */
+ +              pgste_val(pgste) |= bits << 52;
                 page_set_storage_key(address, skey ^ bits, 0);
- -      else if (bits)
+ +      } else if (!(pgste_val(pgste) & PGSTE_HR_BIT) &&
+ +                 (bits & _PAGE_REFERENCED)) {
+ +              /* Transfer referenced bit to host bit in pgste */
+ +              pgste_val(pgste) |= PGSTE_HR_BIT;
                 page_reset_referenced(address);
+ +      }
         /* Transfer page changed & referenced bit to guest bits in pgste */
         pgste_val(pgste) |= bits << 48;         /* GR bit & GC bit */
- -      /* Get host changed & referenced bits from pgste */
- -      bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52;
- -      /* Transfer page changed & referenced bit to kvm user bits */
- -      pgste_val(pgste) |= bits << 45;         /* PGSTE_UR_BIT & PGSTE_UC_BIT */
- -      /* Clear relevant host bits in pgste. */
- -      pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT);
- -      pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
         /* Copy page access key and fetch protection bit to pgste */
- -      pgste_val(pgste) |=
- -              (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
- -      /* Transfer referenced bit to pte */
- -      pte_val(*ptep) |= (bits & _PAGE_REFERENCED) << 1;
+ +      pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ +      pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
   #endif
         return pgste;
   
@@@ -717,11 -679,24 +717,11 @@@
   static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
   {
   #ifdef CONFIG_PGSTE
- -      int young;
- -
         if (pte_val(*ptep) & _PAGE_INVALID)
                 return pgste;
         /* Get referenced bit from storage key */
- -      young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
- -      if (young)
- -              pgste_val(pgste) |= PGSTE_GR_BIT;
- -      /* Get host referenced bit from pgste */
- -      if (pgste_val(pgste) & PGSTE_HR_BIT) {
- -              pgste_val(pgste) &= ~PGSTE_HR_BIT;
- -              young = 1;
- -      }
- -      /* Transfer referenced bit to kvm user bits and pte */
- -      if (young) {
- -              pgste_val(pgste) |= PGSTE_UR_BIT;
- -              pte_val(*ptep) |= _PAGE_SWR;
- -      }
+ +      if (page_reset_referenced(pte_val(*ptep) & PAGE_MASK))
+ +              pgste_val(pgste) |= PGSTE_HR_BIT | PGSTE_GR_BIT;
   #endif
         return pgste;
   }
@@@ -748,13 -723,13 +748,13 @@@ static inline void pgste_set_key(pte_t 
   
   static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
   {
- -      if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_SWW)) {
+ +      if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_WRITE)) {
                 /*
                  * Without enhanced suppression-on-protection force
                  * the dirty bit on for all writable ptes.
                  */
- -              pte_val(entry) |= _PAGE_SWC;
- -              pte_val(entry) &= ~_PAGE_RO;
+ +              pte_val(entry) |= _PAGE_DIRTY;
+ +              pte_val(entry) &= ~_PAGE_PROTECT;
         }
         *ptep = entry;
   }
@@@ -866,17 -841,21 +866,17 @@@ static inline void set_pte_at(struct mm
    */
   static inline int pte_write(pte_t pte)
   {
- -      return (pte_val(pte) & _PAGE_SWW) != 0;
+ +      return (pte_val(pte) & _PAGE_WRITE) != 0;
   }
   
   static inline int pte_dirty(pte_t pte)
   {
- -      return (pte_val(pte) & _PAGE_SWC) != 0;
+ +      return (pte_val(pte) & _PAGE_DIRTY) != 0;
   }
   
   static inline int pte_young(pte_t pte)
   {
- -#ifdef CONFIG_PGSTE
- -      if (pte_val(pte) & _PAGE_SWR)
- -              return 1;
- -#endif
- -      return 0;
+ +      return (pte_val(pte) & _PAGE_YOUNG) != 0;
   }
   
   /*
@@@ -901,12 -880,12 +901,12 @@@ static inline void pud_clear(pud_t *pud
   
   static inline void pmd_clear(pmd_t *pmdp)
   {
- -      pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ +      pmd_val(*pmdp) = _SEGMENT_ENTRY_INVALID;
   }
   
   static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
   {
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +      pte_val(*ptep) = _PAGE_INVALID;
   }
   
   /*
@@@ -917,63 -896,55 +917,63 @@@ static inline pte_t pte_modify(pte_t pt
   {
         pte_val(pte) &= _PAGE_CHG_MASK;
         pte_val(pte) |= pgprot_val(newprot);
- -      if ((pte_val(pte) & _PAGE_SWC) && (pte_val(pte) & _PAGE_SWW))
- -              pte_val(pte) &= ~_PAGE_RO;
+ +      /*
+ +       * newprot for PAGE_NONE, PAGE_READ and PAGE_WRITE has the
+ +       * invalid bit set, clear it again for readable, young pages
+ +       */
+ +      if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ))
+ +              pte_val(pte) &= ~_PAGE_INVALID;
+ +      /*
+ +       * newprot for PAGE_READ and PAGE_WRITE has the page protection
+ +       * bit set, clear it again for writable, dirty pages
+ +       */
+ +      if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE))
+ +              pte_val(pte) &= ~_PAGE_PROTECT;
         return pte;
   }
   
   static inline pte_t pte_wrprotect(pte_t pte)
   {
- -      pte_val(pte) &= ~_PAGE_SWW;
- -      /* Do not clobber _PAGE_TYPE_NONE pages!  */
- -      if (!(pte_val(pte) & _PAGE_INVALID))
- -              pte_val(pte) |= _PAGE_RO;
+ +      pte_val(pte) &= ~_PAGE_WRITE;
+ +      pte_val(pte) |= _PAGE_PROTECT;
         return pte;
   }
   
   static inline pte_t pte_mkwrite(pte_t pte)
   {
- -      pte_val(pte) |= _PAGE_SWW;
- -      if (pte_val(pte) & _PAGE_SWC)
- -              pte_val(pte) &= ~_PAGE_RO;
+ +      pte_val(pte) |= _PAGE_WRITE;
+ +      if (pte_val(pte) & _PAGE_DIRTY)
+ +              pte_val(pte) &= ~_PAGE_PROTECT;
         return pte;
   }
   
   static inline pte_t pte_mkclean(pte_t pte)
   {
- -      pte_val(pte) &= ~_PAGE_SWC;
- -      /* Do not clobber _PAGE_TYPE_NONE pages!  */
- -      if (!(pte_val(pte) & _PAGE_INVALID))
- -              pte_val(pte) |= _PAGE_RO;
+ +      pte_val(pte) &= ~_PAGE_DIRTY;
+ +      pte_val(pte) |= _PAGE_PROTECT;
         return pte;
   }
   
   static inline pte_t pte_mkdirty(pte_t pte)
   {
- -      pte_val(pte) |= _PAGE_SWC;
- -      if (pte_val(pte) & _PAGE_SWW)
- -              pte_val(pte) &= ~_PAGE_RO;
+ +      pte_val(pte) |= _PAGE_DIRTY;
+ +      if (pte_val(pte) & _PAGE_WRITE)
+ +              pte_val(pte) &= ~_PAGE_PROTECT;
         return pte;
   }
   
   static inline pte_t pte_mkold(pte_t pte)
   {
- -#ifdef CONFIG_PGSTE
- -      pte_val(pte) &= ~_PAGE_SWR;
- -#endif
+ +      pte_val(pte) &= ~_PAGE_YOUNG;
+ +      pte_val(pte) |= _PAGE_INVALID;
         return pte;
   }
   
   static inline pte_t pte_mkyoung(pte_t pte)
   {
+ +      pte_val(pte) |= _PAGE_YOUNG;
+ +      if (pte_val(pte) & _PAGE_READ)
+ +              pte_val(pte) &= ~_PAGE_INVALID;
         return pte;
   }
   
@@@ -986,7 -957,7 +986,7 @@@ static inline pte_t pte_mkspecial(pte_
   #ifdef CONFIG_HUGETLB_PAGE
   static inline pte_t pte_mkhuge(pte_t pte)
   {
- -      pte_val(pte) |= (_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_CO);
+ +      pte_val(pte) |= _PAGE_LARGE;
         return pte;
   }
   #endif
@@@ -1003,8 -974,8 +1003,8 @@@ static inline int ptep_test_and_clear_u
         if (mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
                 pgste = pgste_update_all(ptep, pgste);
- -              dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
- -              pgste_val(pgste) &= ~PGSTE_UC_BIT;
+ +              dirty = !!(pgste_val(pgste) & PGSTE_HC_BIT);
+ +              pgste_val(pgste) &= ~PGSTE_HC_BIT;
                 pgste_set_unlock(ptep, pgste);
                 return dirty;
         }
@@@ -1023,75 -994,59 +1023,75 @@@ static inline int ptep_test_and_clear_u
         if (mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
                 pgste = pgste_update_young(ptep, pgste);
- -              young = !!(pgste_val(pgste) & PGSTE_UR_BIT);
- -              pgste_val(pgste) &= ~PGSTE_UR_BIT;
+ +              young = !!(pgste_val(pgste) & PGSTE_HR_BIT);
+ +              pgste_val(pgste) &= ~PGSTE_HR_BIT;
                 pgste_set_unlock(ptep, pgste);
         }
         return young;
   }
   
+ +static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
+ +{
+ +      if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ +#ifndef CONFIG_64BIT
+ +              /* pto must point to the start of the segment table */
+ +              pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
+ +#else
+ +              /* ipte in zarch mode can do the math */
+ +              pte_t *pto = ptep;
+ +#endif
+ +              asm volatile(
+ +                      "       ipte    %2,%3"
+ +                      : "=m" (*ptep) : "m" (*ptep),
+ +                        "a" (pto), "a" (address));
+ +      }
+ +}
+ +
+ +static inline void ptep_flush_lazy(struct mm_struct *mm,
+ +                                 unsigned long address, pte_t *ptep)
+ +{
+ +      int active = (mm == current->active_mm) ? 1 : 0;
+ +
+ +      if (atomic_read(&mm->context.attach_count) > active)
+ +              __ptep_ipte(address, ptep);
+ +      else
+ +              mm->context.flush_mm = 1;
+ +}
+ +
   #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
   static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *ptep)
   {
         pgste_t pgste;
         pte_t pte;
+ +      int young;
   
         if (mm_has_pgste(vma->vm_mm)) {
                 pgste = pgste_get_lock(ptep);
- -              pgste = pgste_update_young(ptep, pgste);
- -              pte = *ptep;
- -              *ptep = pte_mkold(pte);
- -              pgste_set_unlock(ptep, pgste);
- -              return pte_young(pte);
+ +              pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste);
         }
- -      return 0;
+ +
+ +      pte = *ptep;
+ +      __ptep_ipte(addr, ptep);
+ +      young = pte_young(pte);
+ +      pte = pte_mkold(pte);
+ +
+ +      if (mm_has_pgste(vma->vm_mm)) {
+ +              pgste_set_pte(ptep, pte);
+ +              pgste_set_unlock(ptep, pgste);
+ +      } else
+ +              *ptep = pte;
+ +
+ +      return young;
   }
   
   #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
   static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                          unsigned long address, pte_t *ptep)
   {
- -      /* No need to flush TLB
- -       * On s390 reference bits are in storage key and never in TLB
- -       * With virtualization we handle the reference bit, without we
- -       * we can simply return */
         return ptep_test_and_clear_young(vma, address, ptep);
   }
   
- -static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
- -{
- -      if (!(pte_val(*ptep) & _PAGE_INVALID)) {
- -#ifndef CONFIG_64BIT
- -              /* pto must point to the start of the segment table */
- -              pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
- -#else
- -              /* ipte in zarch mode can do the math */
- -              pte_t *pto = ptep;
- -#endif
- -              asm volatile(
- -                      "       ipte    %2,%3"
- -                      : "=m" (*ptep) : "m" (*ptep),
- -                        "a" (pto), "a" (address));
- -      }
- -}
- -
   /*
    * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
    * both clear the TLB for the unmapped pte. The reason is that
@@@ -1112,14 -1067,16 +1112,14 @@@ static inline pte_t ptep_get_and_clear(
         pgste_t pgste;
         pte_t pte;
   
- -      mm->context.flush_mm = 1;
         if (mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
                 pgste = pgste_ipte_notify(mm, address, ptep, pgste);
         }
   
         pte = *ptep;
- -      if (!mm_exclusive(mm))
- -              __ptep_ipte(address, ptep);
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +      ptep_flush_lazy(mm, address, ptep);
+ +      pte_val(*ptep) = _PAGE_INVALID;
   
         if (mm_has_pgste(mm)) {
                 pgste = pgste_update_all(&pte, pgste);
@@@ -1136,14 -1093,15 +1136,14 @@@ static inline pte_t ptep_modify_prot_st
         pgste_t pgste;
         pte_t pte;
   
- -      mm->context.flush_mm = 1;
         if (mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
                 pgste_ipte_notify(mm, address, ptep, pgste);
         }
   
         pte = *ptep;
- -      if (!mm_exclusive(mm))
- -              __ptep_ipte(address, ptep);
+ +      ptep_flush_lazy(mm, address, ptep);
+ +      pte_val(*ptep) |= _PAGE_INVALID;
   
         if (mm_has_pgste(mm)) {
                 pgste = pgste_update_all(&pte, pgste);
@@@ -1159,7 -1117,7 +1159,7 @@@ static inline void ptep_modify_prot_com
         pgste_t pgste;
   
         if (mm_has_pgste(mm)) {
- -              pgste = *(pgste_t *)(ptep + PTRS_PER_PTE);
+ +              pgste = pgste_get(ptep);
                 pgste_set_key(ptep, pgste, pte);
                 pgste_set_pte(ptep, pte);
                 pgste_set_unlock(ptep, pgste);
@@@ -1181,7 -1139,7 +1181,7 @@@ static inline pte_t ptep_clear_flush(st
   
         pte = *ptep;
         __ptep_ipte(address, ptep);
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +      pte_val(*ptep) = _PAGE_INVALID;
   
         if (mm_has_pgste(vma->vm_mm)) {
                 pgste = pgste_update_all(&pte, pgste);
@@@ -1205,17 -1163,18 +1205,17 @@@ static inline pte_t ptep_get_and_clear_
         pgste_t pgste;
         pte_t pte;
   
- -      if (mm_has_pgste(mm)) {
+ +      if (!full && mm_has_pgste(mm)) {
                 pgste = pgste_get_lock(ptep);
- -              if (!full)
- -                      pgste = pgste_ipte_notify(mm, address, ptep, pgste);
+ +              pgste = pgste_ipte_notify(mm, address, ptep, pgste);
         }
   
         pte = *ptep;
         if (!full)
- -              __ptep_ipte(address, ptep);
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +              ptep_flush_lazy(mm, address, ptep);
+ +      pte_val(*ptep) = _PAGE_INVALID;
   
- -      if (mm_has_pgste(mm)) {
+ +      if (!full && mm_has_pgste(mm)) {
                 pgste = pgste_update_all(&pte, pgste);
                 pgste_set_unlock(ptep, pgste);
         }
@@@ -1230,12 -1189,14 +1230,12 @@@ static inline pte_t ptep_set_wrprotect(
         pte_t pte = *ptep;
   
         if (pte_write(pte)) {
- -              mm->context.flush_mm = 1;
                 if (mm_has_pgste(mm)) {
                         pgste = pgste_get_lock(ptep);
                         pgste = pgste_ipte_notify(mm, address, ptep, pgste);
                 }
   
- -              if (!mm_exclusive(mm))
- -                      __ptep_ipte(address, ptep);
+ +              ptep_flush_lazy(mm, address, ptep);
                 pte = pte_wrprotect(pte);
   
                 if (mm_has_pgste(mm)) {
@@@ -1279,7 -1240,7 +1279,7 @@@ static inline pte_t mk_pte_phys(unsigne
   {
         pte_t __pte;
         pte_val(__pte) = physpage + pgprot_val(pgprot);
- -      return __pte;
+ +      return pte_mkyoung(__pte);
   }
   
   static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
@@@ -1287,8 -1248,10 +1287,8 @@@
         unsigned long physpage = page_to_phys(page);
         pte_t __pte = mk_pte_phys(physpage, pgprot);
   
- -      if ((pte_val(__pte) & _PAGE_SWW) && PageDirty(page)) {
- -              pte_val(__pte) |= _PAGE_SWC;
- -              pte_val(__pte) &= ~_PAGE_RO;
- -      }
+ +      if (pte_write(__pte) && PageDirty(page))
+ +              __pte = pte_mkdirty(__pte);
         return __pte;
   }
   
@@@ -1350,7 -1313,7 +1350,7 @@@ static inline void __pmd_idte(unsigned 
         unsigned long sto = (unsigned long) pmdp -
                             pmd_index(address) * sizeof(pmd_t);
   
- -      if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
+ +      if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)) {
                 asm volatile(
                         "       .insn   rrf,0xb98e0000,%2,%3,0,0"
                         : "=m" (*pmdp)
@@@ -1361,68 -1324,24 +1361,68 @@@
         }
   }
   
+ +static inline void __pmd_csp(pmd_t *pmdp)
+ +{
+ +      register unsigned long reg2 asm("2") = pmd_val(*pmdp);
+ +      register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
+ +                                             _SEGMENT_ENTRY_INVALID;
+ +      register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
+ +
+ +      asm volatile(
+ +              "       csp %1,%3"
+ +              : "=m" (*pmdp)
+ +              : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
+ +}
+ +
   #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
   static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
   {
         /*
- -       * pgprot is PAGE_NONE, PAGE_RO, or PAGE_RW (see __Pxxx / __Sxxx)
+ +       * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
          * Convert to segment table entry format.
          */
         if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
                 return pgprot_val(SEGMENT_NONE);
- -      if (pgprot_val(pgprot) == pgprot_val(PAGE_RO))
- -              return pgprot_val(SEGMENT_RO);
- -      return pgprot_val(SEGMENT_RW);
+ +      if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
+ +              return pgprot_val(SEGMENT_READ);
+ +      return pgprot_val(SEGMENT_WRITE);
+ +}
+ +
+ +static inline pmd_t pmd_mkyoung(pmd_t pmd)
+ +{
+ +#ifdef CONFIG_64BIT
+ +      if (pmd_prot_none(pmd)) {
+ +              pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
+ +      } else {
+ +              pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG;
+ +              pmd_val(pmd) &= ~_SEGMENT_ENTRY_INVALID;
+ +      }
+ +#endif
+ +      return pmd;
+ +}
+ +
+ +static inline pmd_t pmd_mkold(pmd_t pmd)
+ +{
+ +#ifdef CONFIG_64BIT
+ +      if (pmd_prot_none(pmd)) {
+ +              pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
+ +      } else {
+ +              pmd_val(pmd) &= ~_SEGMENT_ENTRY_YOUNG;
+ +              pmd_val(pmd) |= _SEGMENT_ENTRY_INVALID;
+ +      }
+ +#endif
+ +      return pmd;
   }
   
   static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
   {
+ +      int young;
+ +
+ +      young = pmd_young(pmd);
         pmd_val(pmd) &= _SEGMENT_CHG_MASK;
         pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+ +      if (young)
+ +              pmd = pmd_mkyoung(pmd);
         return pmd;
   }
   
@@@ -1430,18 -1349,29 +1430,29 @@@ static inline pmd_t mk_pmd_phys(unsigne
   {
         pmd_t __pmd;
         pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
- -      return __pmd;
+ +      return pmd_mkyoung(__pmd);
   }
   
   static inline pmd_t pmd_mkwrite(pmd_t pmd)
   {
- -      /* Do not clobber _HPAGE_TYPE_NONE pages! */
- -      if (!(pmd_val(pmd) & _SEGMENT_ENTRY_INV))
- -              pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
+ +      /* Do not clobber PROT_NONE segments! */
+ +      if (!pmd_prot_none(pmd))
+ +              pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
         return pmd;
   }
   #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
   
+ static inline void pmdp_flush_lazy(struct mm_struct *mm,
+                                  unsigned long address, pmd_t *pmdp)
+ {
+       int active = (mm == current->active_mm) ? 1 : 0;
+ 
+       if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
+               __pmd_idte(address, pmdp);
+       else
+               mm->context.flush_mm = 1;
+ }
+ 
   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
   
   #define __HAVE_ARCH_PGTABLE_DEPOSIT
@@@ -1459,7 -1389,7 +1470,7 @@@ static inline int pmd_trans_splitting(p
   static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                               pmd_t *pmdp, pmd_t entry)
   {
- -      if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1)
+ +      if (!(pmd_val(entry) & _SEGMENT_ENTRY_INVALID) && MACHINE_HAS_EDAT1)
                 pmd_val(entry) |= _SEGMENT_ENTRY_CO;
         *pmdp = entry;
   }
@@@ -1472,9 -1402,7 +1483,9 @@@ static inline pmd_t pmd_mkhuge(pmd_t pm
   
   static inline pmd_t pmd_wrprotect(pmd_t pmd)
   {
- -      pmd_val(pmd) |= _SEGMENT_ENTRY_RO;
+ +      /* Do not clobber PROT_NONE segments! */
+ +      if (!pmd_prot_none(pmd))
+ +              pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
         return pmd;
   }
   
@@@ -1484,16 -1412,50 +1495,16 @@@ static inline pmd_t pmd_mkdirty(pmd_t p
         return pmd;
   }
   
- -static inline pmd_t pmd_mkold(pmd_t pmd)
- -{
- -      /* No referenced bit in the segment table entry. */
- -      return pmd;
- -}
- -
- -static inline pmd_t pmd_mkyoung(pmd_t pmd)
- -{
- -      /* No referenced bit in the segment table entry. */
- -      return pmd;
- -}
- -
   #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
   static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                             unsigned long address, pmd_t *pmdp)
   {
- -      unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK;
- -      long tmp, rc;
- -      int counter;
+ +      pmd_t pmd;
   
- -      rc = 0;
- -      if (MACHINE_HAS_RRBM) {
- -              counter = PTRS_PER_PTE >> 6;
- -              asm volatile(
- -                      "0:     .insn   rre,0xb9ae0000,%0,%3\n" /* rrbm */
- -                      "       ogr     %1,%0\n"
- -                      "       la      %3,0(%4,%3)\n"
- -                      "       brct    %2,0b\n"
- -                      : "=&d" (tmp), "+&d" (rc), "+d" (counter),
- -                        "+a" (pmd_addr)
- -                      : "a" (64 * 4096UL) : "cc");
- -              rc = !!rc;
- -      } else {
- -              counter = PTRS_PER_PTE;
- -              asm volatile(
- -                      "0:     rrbe    0,%2\n"
- -                      "       la      %2,0(%3,%2)\n"
- -                      "       brc     12,1f\n"
- -                      "       lhi     %0,1\n"
- -                      "1:     brct    %1,0b\n"
- -                      : "+d" (rc), "+d" (counter), "+a" (pmd_addr)
- -                      : "a" (4096UL) : "cc");
- -      }
- -      return rc;
+ +      pmd = *pmdp;
+ +      __pmd_idte(address, pmdp);
+ +      *pmdp = pmd_mkold(pmd);
+ +      return pmd_young(pmd);
   }
   
   #define __HAVE_ARCH_PMDP_GET_AND_CLEAR
@@@ -1559,8 -1521,10 +1570,8 @@@ static inline unsigned long pmd_pfn(pmd
    * exception will occur instead of a page translation exception. The
    * specification exception has the bad habit not to store necessary
    * information in the lowcore.
- - * Bit 21 and bit 22 are the page invalid bit and the page protection
- - * bit. We set both to indicate a swapped page.
- - * Bit 30 and 31 are used to distinguish the different page types. For
- - * a swapped page these bits need to be zero.
+ + * Bits 21, 22, 30 and 31 are used to indicate the page type.
+ + * A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
    * This leaves the bits 1-19 and bits 24-29 to store type and offset.
    * We use the 5 bits from 25-29 for the type and the 20 bits from 1-19
    * plus 24 for the offset.
@@@ -1574,8 -1538,10 +1585,8 @@@
    * exception will occur instead of a page translation exception. The
    * specification exception has the bad habit not to store necessary
    * information in the lowcore.
- - * Bit 53 and bit 54 are the page invalid bit and the page protection
- - * bit. We set both to indicate a swapped page.
- - * Bit 62 and 63 are used to distinguish the different page types. For
- - * a swapped page these bits need to be zero.
+ + * Bits 53, 54, 62 and 63 are used to indicate the page type.
+ + * A swap pte is indicated by bit pattern (pte & 0x603) == 0x402
    * This leaves the bits 0-51 and bits 56-61 to store type and offset.
    * We use the 5 bits from 57-61 for the type and the 53 bits from 0-51
    * plus 56 for the offset.
@@@ -1592,7 -1558,7 +1603,7 @@@ static inline pte_t mk_swap_pte(unsigne
   {
         pte_t pte;
         offset &= __SWP_OFFSET_MASK;
- -      pte_val(pte) = _PAGE_TYPE_SWAP | ((type & 0x1f) << 2) |
+ +      pte_val(pte) = _PAGE_INVALID | _PAGE_TYPE | ((type & 0x1f) << 2) |
                 ((offset & 1UL) << 7) | ((offset & ~1UL) << 11);
         return pte;
   }
@@@ -1615,7 -1581,7 +1626,7 @@@
   
   #define pgoff_to_pte(__off) \
         ((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \
- -                 | _PAGE_TYPE_FILE })
+ +                 | _PAGE_INVALID | _PAGE_PROTECT })
   
   #endif /* !__ASSEMBLY__ */
   
diff --combined arch/s390/include/asm/processor.h

index b0e6435b2f02195e60303a9c46f0c199220c5225,83c85c217f5ca3faa6e1dcecbe58caabb90c6941..0eb37505cab11c71f083ed508f02c98a72127e95
--- 1/arch/s390/include/asm/processor.h
--- 2/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@@ -43,6 -43,7 +43,7 @@@ extern void execve_tail(void)
   #ifndef CONFIG_64BIT
   
   #define TASK_SIZE             (1UL << 31)
+ #define TASK_MAX_SIZE         (1UL << 31)
   #define TASK_UNMAPPED_BASE    (1UL << 30)
   
   #else /* CONFIG_64BIT */
@@@ -51,6 -52,7 +52,7 @@@
   #define TASK_UNMAPPED_BASE    (test_thread_flag(TIF_31BIT) ? \
                                         (1UL << 30) : (1UL << 41))
   #define TASK_SIZE             TASK_SIZE_OF(current)
+ #define TASK_MAX_SIZE         (1UL << 53)
   
   #endif /* CONFIG_64BIT */
   
@@@ -91,15 -93,7 +93,15 @@@ struct thread_struct 
   #endif
   };
   
- -#define PER_FLAG_NO_TE                1UL     /* Flag to disable transactions. */
+ +/* Flag to disable transactions. */
+ +#define PER_FLAG_NO_TE                        1UL
+ +/* Flag to enable random transaction aborts. */
+ +#define PER_FLAG_TE_ABORT_RAND                2UL
+ +/* Flag to specify random transaction abort mode:
+ + * - abort each transaction at a random instruction before TEND if set.
+ + * - abort random transactions at a random instruction if cleared.
+ + */
+ +#define PER_FLAG_TE_ABORT_RAND_TEND   4UL
   
   typedef struct thread_struct thread_struct;
   
diff --combined arch/s390/kvm/kvm-s390.c

index 34c1c9a90be288d9080d93373660d020bedf2a90,ac8e6670c551ccc7dc73cebb61c4dbe84b4cbac8..776dafe918db30b8c3f4823b8bf11c461bed6f5f
--- 1/arch/s390/kvm/kvm-s390.c
--- 2/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@@ -28,6 -28,7 +28,7 @@@
   #include <asm/pgtable.h>
   #include <asm/nmi.h>
   #include <asm/switch_to.h>
+ #include <asm/facility.h>
   #include <asm/sclp.h>
   #include "kvm-s390.h"
   #include "gaccess.h"
@@@ -84,9 -85,15 +85,15 @@@ struct kvm_stats_debugfs_item debugfs_e
         { NULL }
   };
   
- static unsigned long long *facilities;
+ unsigned long *vfacilities;
   static struct gmap_notifier gmap_notifier;
   
+ /* test availability of vfacility */
+ static inline int test_vfacility(unsigned long nr)
+ {
+       return __test_facility(nr, (void *) vfacilities);
+ }
+ 
   /* Section: not file related */
   int kvm_arch_hardware_enable(void *garbage)
   {
@@@ -387,7 -394,7 +394,7 @@@ int kvm_arch_vcpu_setup(struct kvm_vcp
         vcpu->arch.sie_block->ecb   = 6;
         vcpu->arch.sie_block->ecb2  = 8;
         vcpu->arch.sie_block->eca   = 0xC1002001U;
-       vcpu->arch.sie_block->fac   = (int) (long) facilities;
+       vcpu->arch.sie_block->fac   = (int) (long) vfacilities;
         hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
         tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
                      (unsigned long) vcpu);
@@@ -702,25 -709,14 +709,25 @@@ static int __vcpu_run(struct kvm_vcpu *
                 return rc;
   
         vcpu->arch.sie_block->icptcode = 0;
- -      preempt_disable();
- -      kvm_guest_enter();
- -      preempt_enable();
         VCPU_EVENT(vcpu, 6, "entering sie flags %x",
                    atomic_read(&vcpu->arch.sie_block->cpuflags));
         trace_kvm_s390_sie_enter(vcpu,
                                  atomic_read(&vcpu->arch.sie_block->cpuflags));
+ +
+ +      /*
+ +       * As PF_VCPU will be used in fault handler, between guest_enter
+ +       * and guest_exit should be no uaccess.
+ +       */
+ +      preempt_disable();
+ +      kvm_guest_enter();
+ +      preempt_enable();
         rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
+ +      kvm_guest_exit();
+ +
+ +      VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+ +                 vcpu->arch.sie_block->icptcode);
+ +      trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
+ +
         if (rc > 0)
                 rc = 0;
         if (rc < 0) {
@@@ -732,6 -728,10 +739,6 @@@
                         rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                 }
         }
- -      VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
- -                 vcpu->arch.sie_block->icptcode);
- -      trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
- -      kvm_guest_exit();
   
         memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
         return rc;
@@@ -1063,6 -1063,10 +1070,10 @@@ int kvm_arch_create_memslot(struct kvm_
         return 0;
   }
   
+ void kvm_arch_memslots_updated(struct kvm *kvm)
+ {
+ }
+ 
   /* Section: memory related */
   int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
@@@ -1129,20 -1133,20 +1140,20 @@@ static int __init kvm_s390_init(void
          * to hold the maximum amount of facilities. On the other hand, we
          * only set facilities that are known to work in KVM.
          */
-       facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
-       if (!facilities) {
+       vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
+       if (!vfacilities) {
                 kvm_exit();
                 return -ENOMEM;
         }
-       memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
-       facilities[0] &= 0xff82fff3f47c0000ULL;
-       facilities[1] &= 0x001c000000000000ULL;
+       memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
+       vfacilities[0] &= 0xff82fff3f47c0000UL;
+       vfacilities[1] &= 0x001c000000000000UL;
         return 0;
   }
   
   static void __exit kvm_s390_exit(void)
   {
-       free_page((unsigned long) facilities);
+       free_page((unsigned long) vfacilities);
         kvm_exit();
   }
   
diff --combined arch/s390/kvm/priv.c

index 4cdc54e63ebcb366786347de18437f327ee8e2b2,8f8d8ee9b1fb6e0ae53fa3c707c1ee48b11f968d..59200ee275e568ae99d593484b5575be320b79b8
--- 1/arch/s390/kvm/priv.c
--- 2/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@@ -16,7 -16,6 +16,7 @@@
   #include <linux/errno.h>
   #include <linux/compat.h>
   #include <asm/asm-offsets.h>
+ +#include <asm/facility.h>
   #include <asm/current.h>
   #include <asm/debug.h>
   #include <asm/ebcdic.h>
@@@ -164,8 -163,7 +164,7 @@@ static int handle_tpi(struct kvm_vcpu *
         kfree(inti);
   no_interrupt:
         /* Set condition code and we're done. */
-       vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-       vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
+       kvm_s390_set_psw_cc(vcpu, cc);
         return 0;
   }
   
@@@ -220,15 -218,13 +219,13 @@@ static int handle_io_inst(struct kvm_vc
                  * Set condition code 3 to stop the guest from issuing channel
                  * I/O instructions.
                  */
-               vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
-               vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
+               kvm_s390_set_psw_cc(vcpu, 3);
                 return 0;
         }
   }
   
   static int handle_stfl(struct kvm_vcpu *vcpu)
   {
-       unsigned int facility_list;
         int rc;
   
         vcpu->stat.instruction_stfl++;
@@@ -236,15 -232,13 +233,13 @@@
         if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
   
-       /* only pass the facility bits, which we can handle */
-       facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
- 
         rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
-                          &facility_list, sizeof(facility_list));
+                          vfacilities, 4);
         if (rc)
                 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
-       trace_kvm_s390_handle_stfl(vcpu, facility_list);
+       VCPU_EVENT(vcpu, 5, "store facility list value %x",
+                  *(unsigned int *) vfacilities);
+       trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
         return 0;
   }
   
@@@ -387,7 -381,7 +382,7 @@@ static int handle_stsi(struct kvm_vcpu 
                 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
   
         if (fc > 3) {
-               vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;     /* cc 3 */
+               kvm_s390_set_psw_cc(vcpu, 3);
                 return 0;
         }
   
@@@ -397,7 -391,7 +392,7 @@@
   
         if (fc == 0) {
                 vcpu->run->s.regs.gprs[0] = 3 << 28;
-               vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);  /* cc 0 */
+               kvm_s390_set_psw_cc(vcpu, 0);
                 return 0;
         }
   
@@@ -431,12 -425,11 +426,11 @@@
         }
         trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
         free_page(mem);
-       vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
+       kvm_s390_set_psw_cc(vcpu, 0);
         vcpu->run->s.regs.gprs[0] = 0;
         return 0;
   out_no_data:
-       /* condition code 3 */
-       vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
+       kvm_s390_set_psw_cc(vcpu, 3);
   out_exception:
         free_page(mem);
         return rc;
@@@ -494,12 -487,12 +488,12 @@@ static int handle_epsw(struct kvm_vcpu 
         kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
   
         /* This basically extracts the mask half of the psw. */
-       vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
+       vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
         vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
         if (reg2) {
-               vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
+               vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
                 vcpu->run->s.regs.gprs[reg2] |=
-                       vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
+                       vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
         }
         return 0;
   }
@@@ -533,7 -526,8 +527,7 @@@ static int handle_pfmf(struct kvm_vcpu 
                 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
   
         /* Only provide non-quiescing support if the host supports it */
- -      if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ &&
- -          S390_lowcore.stfl_fac_list & 0x00020000)
+ +      if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && !test_facility(14))
                 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
   
         /* No support for conditional-SSKE */
diff --combined arch/s390/mm/pgtable.c

index 6d16132d08501bb40fd4598d4dae951601bd37b5,967d0bf1c059593002acff3aca0bcbfeaa3824bd..bf7c0dc64a76111d307b7b48585cc40ecb6741aa
--- 1/arch/s390/mm/pgtable.c
--- 2/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@@ -161,7 -161,7 +161,7 @@@ static int gmap_unlink_segment(struct g
         struct gmap_rmap *rmap;
         struct page *page;
   
- -      if (*table & _SEGMENT_ENTRY_INV)
+ +      if (*table & _SEGMENT_ENTRY_INVALID)
                 return 0;
         page = pfn_to_page(*table >> PAGE_SHIFT);
         mp = (struct gmap_pgtable *) page->index;
@@@ -172,7 -172,7 +172,7 @@@
                 kfree(rmap);
                 break;
         }
- -      *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+ +      *table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
         return 1;
   }
   
@@@ -258,7 -258,7 +258,7 @@@ static int gmap_alloc_table(struct gma
                 return -ENOMEM;
         new = (unsigned long *) page_to_phys(page);
         crst_table_init(new, init);
- -      if (*table & _REGION_ENTRY_INV) {
+ +      if (*table & _REGION_ENTRY_INVALID) {
                 list_add(&page->lru, &gmap->crst_list);
                 *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                         (*table & _REGION_ENTRY_TYPE_MASK);
@@@ -292,22 -292,22 +292,22 @@@ int gmap_unmap_segment(struct gmap *gma
         for (off = 0; off < len; off += PMD_SIZE) {
                 /* Walk the guest addr space page table */
                 table = gmap->table + (((to + off) >> 53) & 0x7ff);
- -              if (*table & _REGION_ENTRY_INV)
+ +              if (*table & _REGION_ENTRY_INVALID)
                         goto out;
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + (((to + off) >> 42) & 0x7ff);
- -              if (*table & _REGION_ENTRY_INV)
+ +              if (*table & _REGION_ENTRY_INVALID)
                         goto out;
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + (((to + off) >> 31) & 0x7ff);
- -              if (*table & _REGION_ENTRY_INV)
+ +              if (*table & _REGION_ENTRY_INVALID)
                         goto out;
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + (((to + off) >> 20) & 0x7ff);
   
                 /* Clear segment table entry in guest address space. */
                 flush |= gmap_unlink_segment(gmap, table);
- -              *table = _SEGMENT_ENTRY_INV;
+ +              *table = _SEGMENT_ENTRY_INVALID;
         }
   out:
         spin_unlock(&gmap->mm->page_table_lock);
@@@ -335,7 -335,7 +335,7 @@@ int gmap_map_segment(struct gmap *gmap
   
         if ((from | to | len) & (PMD_SIZE - 1))
                 return -EINVAL;
-       if (len == 0 || from + len > PGDIR_SIZE ||
+       if (len == 0 || from + len > TASK_MAX_SIZE ||
             from + len < from || to + len < to)
                 return -EINVAL;
   
@@@ -345,17 -345,17 +345,17 @@@
         for (off = 0; off < len; off += PMD_SIZE) {
                 /* Walk the gmap address space page table */
                 table = gmap->table + (((to + off) >> 53) & 0x7ff);
- -              if ((*table & _REGION_ENTRY_INV) &&
+ +              if ((*table & _REGION_ENTRY_INVALID) &&
                     gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
                         goto out_unmap;
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + (((to + off) >> 42) & 0x7ff);
- -              if ((*table & _REGION_ENTRY_INV) &&
+ +              if ((*table & _REGION_ENTRY_INVALID) &&
                     gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
                         goto out_unmap;
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + (((to + off) >> 31) & 0x7ff);
- -              if ((*table & _REGION_ENTRY_INV) &&
+ +              if ((*table & _REGION_ENTRY_INVALID) &&
                     gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
                         goto out_unmap;
                 table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
@@@ -363,8 -363,7 +363,8 @@@
   
                 /* Store 'from' address in an invalid segment table entry. */
                 flush |= gmap_unlink_segment(gmap, table);
- -              *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
+ +              *table =  (from + off) | (_SEGMENT_ENTRY_INVALID |
+ +                                        _SEGMENT_ENTRY_PROTECT);
         }
         spin_unlock(&gmap->mm->page_table_lock);
         up_read(&gmap->mm->mmap_sem);
@@@ -385,15 -384,15 +385,15 @@@ static unsigned long *gmap_table_walk(u
         unsigned long *table;
   
         table = gmap->table + ((address >> 53) & 0x7ff);
- -      if (unlikely(*table & _REGION_ENTRY_INV))
+ +      if (unlikely(*table & _REGION_ENTRY_INVALID))
                 return ERR_PTR(-EFAULT);
         table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
         table = table + ((address >> 42) & 0x7ff);
- -      if (unlikely(*table & _REGION_ENTRY_INV))
+ +      if (unlikely(*table & _REGION_ENTRY_INVALID))
                 return ERR_PTR(-EFAULT);
         table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
         table = table + ((address >> 31) & 0x7ff);
- -      if (unlikely(*table & _REGION_ENTRY_INV))
+ +      if (unlikely(*table & _REGION_ENTRY_INVALID))
                 return ERR_PTR(-EFAULT);
         table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
         table = table + ((address >> 20) & 0x7ff);
@@@ -423,11 -422,11 +423,11 @@@ unsigned long __gmap_translate(unsigne
                 return PTR_ERR(segment_ptr);
         /* Convert the gmap address to an mm address. */
         segment = *segment_ptr;
- -      if (!(segment & _SEGMENT_ENTRY_INV)) {
+ +      if (!(segment & _SEGMENT_ENTRY_INVALID)) {
                 page = pfn_to_page(segment >> PAGE_SHIFT);
                 mp = (struct gmap_pgtable *) page->index;
                 return mp->vmaddr | (address & ~PMD_MASK);
- -      } else if (segment & _SEGMENT_ENTRY_RO) {
+ +      } else if (segment & _SEGMENT_ENTRY_PROTECT) {
                 vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
                 return vmaddr | (address & ~PMD_MASK);
         }
@@@ -518,8 -517,8 +518,8 @@@ static void gmap_disconnect_pgtable(str
         page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         mp = (struct gmap_pgtable *) page->index;
         list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
- -              *rmap->entry =
- -                      _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
+ +              *rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
+ +                                           _SEGMENT_ENTRY_PROTECT);
                 list_del(&rmap->list);
                 kfree(rmap);
                 flush = 1;
@@@ -546,13 -545,13 +546,13 @@@ unsigned long __gmap_fault(unsigned lon
         /* Convert the gmap address to an mm address. */
         while (1) {
                 segment = *segment_ptr;
- -              if (!(segment & _SEGMENT_ENTRY_INV)) {
+ +              if (!(segment & _SEGMENT_ENTRY_INVALID)) {
                         /* Page table is present */
                         page = pfn_to_page(segment >> PAGE_SHIFT);
                         mp = (struct gmap_pgtable *) page->index;
                         return mp->vmaddr | (address & ~PMD_MASK);
                 }
- -              if (!(segment & _SEGMENT_ENTRY_RO))
+ +              if (!(segment & _SEGMENT_ENTRY_PROTECT))
                         /* Nothing mapped in the gmap address space. */
                         break;
                 rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
@@@ -587,25 -586,25 +587,25 @@@ void gmap_discard(unsigned long from, u
         while (address < to) {
                 /* Walk the gmap address space page table */
                 table = gmap->table + ((address >> 53) & 0x7ff);
- -              if (unlikely(*table & _REGION_ENTRY_INV)) {
+ +              if (unlikely(*table & _REGION_ENTRY_INVALID)) {
                         address = (address + PMD_SIZE) & PMD_MASK;
                         continue;
                 }
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + ((address >> 42) & 0x7ff);
- -              if (unlikely(*table & _REGION_ENTRY_INV)) {
+ +              if (unlikely(*table & _REGION_ENTRY_INVALID)) {
                         address = (address + PMD_SIZE) & PMD_MASK;
                         continue;
                 }
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + ((address >> 31) & 0x7ff);
- -              if (unlikely(*table & _REGION_ENTRY_INV)) {
+ +              if (unlikely(*table & _REGION_ENTRY_INVALID)) {
                         address = (address + PMD_SIZE) & PMD_MASK;
                         continue;
                 }
                 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                 table = table + ((address >> 20) & 0x7ff);
- -              if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
+ +              if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
                         address = (address + PMD_SIZE) & PMD_MASK;
                         continue;
                 }
@@@ -688,7 -687,7 +688,7 @@@ int gmap_ipte_notify(struct gmap *gmap
                         continue;
                 /* Set notification bit in the pgste of the pte */
                 entry = *ptep;
- -              if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
+ +              if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
                         pgste = pgste_get_lock(ptep);
                         pgste_val(pgste) |= PGSTE_IN_BIT;
                         pgste_set_unlock(ptep, pgste);
@@@ -732,6 -731,11 +732,11 @@@ void gmap_do_ipte_notify(struct mm_stru
         spin_unlock(&gmap_notifier_lock);
   }
   
+ static inline int page_table_with_pgste(struct page *page)
+ {
+       return atomic_read(&page->_mapcount) == 0;
+ }
+ 
   static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
   {
@@@ -751,11 -755,10 +756,11 @@@
         mp->vmaddr = vmaddr & PMD_MASK;
         INIT_LIST_HEAD(&mp->mapper);
         page->index = (unsigned long) mp;
-       atomic_set(&page->_mapcount, 3);
+       atomic_set(&page->_mapcount, 0);
         table = (unsigned long *) page_to_phys(page);
- -      clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
- -      clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ +      clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ +      clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
+ +                  PAGE_SIZE/2);
         return table;
   }
   
@@@ -793,21 -796,26 +798,21 @@@ int set_guest_storage_key(struct mm_str
         pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
         pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
         if (!(pte_val(*ptep) & _PAGE_INVALID)) {
- -              unsigned long address, bits;
- -              unsigned char skey;
+ +              unsigned long address, bits, skey;
   
                 address = pte_val(*ptep) & PAGE_MASK;
- -              skey = page_get_storage_key(address);
+ +              skey = (unsigned long) page_get_storage_key(address);
                 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+ +              skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
                 /* Set storage key ACC and FP */
- -              page_set_storage_key(address,
- -                              (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)),
- -                              !nq);
- -
+ +              page_set_storage_key(address, skey, !nq);
                 /* Merge host changed & referenced into pgste  */
                 pgste_val(new) |= bits << 52;
- -              /* Transfer skey changed & referenced bit to kvm user bits */
- -              pgste_val(new) |= bits << 45;   /* PGSTE_UR_BIT & PGSTE_UC_BIT */
         }
         /* changing the guest storage key is considered a change of the page */
         if ((pgste_val(new) ^ pgste_val(old)) &
             (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
- -              pgste_val(new) |= PGSTE_UC_BIT;
+ +              pgste_val(new) |= PGSTE_HC_BIT;
   
         pgste_set_unlock(ptep, new);
         pte_unmap_unlock(*ptep, ptl);
@@@ -818,6 -826,11 +823,11 @@@ EXPORT_SYMBOL(set_guest_storage_key)
   
   #else /* CONFIG_PGSTE */
   
+ static inline int page_table_with_pgste(struct page *page)
+ {
+       return 0;
+ }
+ 
   static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
   {
@@@ -875,7 -888,7 +885,7 @@@ unsigned long *page_table_alloc(struct 
                 pgtable_page_ctor(page);
                 atomic_set(&page->_mapcount, 1);
                 table = (unsigned long *) page_to_phys(page);
- -              clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
+ +              clear_table(table, _PAGE_INVALID, PAGE_SIZE);
                 spin_lock_bh(&mm->context.list_lock);
                 list_add(&page->lru, &mm->context.pgtable_list);
         } else {
@@@ -894,12 -907,12 +904,12 @@@ void page_table_free(struct mm_struct *
         struct page *page;
         unsigned int bit, mask;
   
-       if (mm_has_pgste(mm)) {
+       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 return page_table_free_pgste(table);
         }
         /* Free 1K/2K page table fragment of a 4K page */
-       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@@ -937,14 -950,14 +947,14 @@@ void page_table_free_rcu(struct mmu_gat
         unsigned int bit, mask;
   
         mm = tlb->mm;
-       if (mm_has_pgste(mm)) {
+       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+       if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 table = (unsigned long *) (__pa(table) | FRAG_MASK);
                 tlb_remove_table(tlb, table);
                 return;
         }
         bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
-       page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                 list_del(&page->lru);
@@@ -1004,6 -1017,7 +1014,6 @@@ void tlb_table_flush(struct mmu_gather 
         struct mmu_table_batch **batch = &tlb->batch;
   
         if (*batch) {
- -              __tlb_flush_mm(tlb->mm);
                 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                 *batch = NULL;
         }
@@@ -1013,12 -1027,11 +1023,12 @@@ void tlb_remove_table(struct mmu_gathe
   {
         struct mmu_table_batch **batch = &tlb->batch;
   
+ +      tlb->mm->context.flush_mm = 1;
         if (*batch == NULL) {
                 *batch = (struct mmu_table_batch *)
                         __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                 if (*batch == NULL) {
- -                      __tlb_flush_mm(tlb->mm);
+ +                      __tlb_flush_mm_lazy(tlb->mm);
                         tlb_remove_table_one(table);
                         return;
                 }
@@@ -1026,40 -1039,124 +1036,124 @@@
         }
         (*batch)->tables[(*batch)->nr++] = table;
         if ((*batch)->nr == MAX_TABLE_BATCH)
- -              tlb_table_flush(tlb);
+ +              tlb_flush_mmu(tlb);
   }
   
   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- void thp_split_vma(struct vm_area_struct *vma)
+ static inline void thp_split_vma(struct vm_area_struct *vma)
   {
         unsigned long addr;
-       struct page *page;
   
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
-               page = follow_page(vma, addr, FOLL_SPLIT);
-       }
+       for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
+               follow_page(vma, addr, FOLL_SPLIT);
   }
   
- void thp_split_mm(struct mm_struct *mm)
+ static inline void thp_split_mm(struct mm_struct *mm)
   {
-       struct vm_area_struct *vma = mm->mmap;
+       struct vm_area_struct *vma;
   
-       while (vma != NULL) {
+       for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                 thp_split_vma(vma);
                 vma->vm_flags &= ~VM_HUGEPAGE;
                 vma->vm_flags |= VM_NOHUGEPAGE;
-               vma = vma->vm_next;
         }
+       mm->def_flags |= VM_NOHUGEPAGE;
+ }
+ #else
+ static inline void thp_split_mm(struct mm_struct *mm)
+ {
   }
   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   
+ static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
+                               struct mm_struct *mm, pud_t *pud,
+                               unsigned long addr, unsigned long end)
+ {
+       unsigned long next, *table, *new;
+       struct page *page;
+       pmd_t *pmd;
+ 
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+ again:
+               if (pmd_none_or_clear_bad(pmd))
+                       continue;
+               table = (unsigned long *) pmd_deref(*pmd);
+               page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+               if (page_table_with_pgste(page))
+                       continue;
+               /* Allocate new page table with pgstes */
+               new = page_table_alloc_pgste(mm, addr);
+               if (!new) {
+                       mm->context.has_pgste = 0;
+                       continue;
+               }
+               spin_lock(&mm->page_table_lock);
+               if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
+                       /* Nuke pmd entry pointing to the "short" page table */
+                       pmdp_flush_lazy(mm, addr, pmd);
+                       pmd_clear(pmd);
+                       /* Copy ptes from old table to new table */
+                       memcpy(new, table, PAGE_SIZE/2);
+                       clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+                       /* Establish new table */
+                       pmd_populate(mm, pmd, (pte_t *) new);
+                       /* Free old table with rcu, there might be a walker! */
+                       page_table_free_rcu(tlb, table);
+                       new = NULL;
+               }
+               spin_unlock(&mm->page_table_lock);
+               if (new) {
+                       page_table_free_pgste(new);
+                       goto again;
+               }
+       } while (pmd++, addr = next, addr != end);
+ 
+       return addr;
+ }
+ 
+ static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
+                                  struct mm_struct *mm, pgd_t *pgd,
+                                  unsigned long addr, unsigned long end)
+ {
+       unsigned long next;
+       pud_t *pud;
+ 
+       pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+               if (pud_none_or_clear_bad(pud))
+                       continue;
+               next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
+       } while (pud++, addr = next, addr != end);
+ 
+       return addr;
+ }
+ 
+ static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
+                              unsigned long addr, unsigned long end)
+ {
+       unsigned long next;
+       pgd_t *pgd;
+ 
+       pgd = pgd_offset(mm, addr);
+       do {
+               next = pgd_addr_end(addr, end);
+               if (pgd_none_or_clear_bad(pgd))
+                       continue;
+               next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
+       } while (pgd++, addr = next, addr != end);
+ }
+ 
   /*
    * switch on pgstes for its userspace process (for kvm)
    */
   int s390_enable_sie(void)
   {
         struct task_struct *tsk = current;
-       struct mm_struct *mm, *old_mm;
+       struct mm_struct *mm = tsk->mm;
+       struct mmu_gather tlb;
   
         /* Do we have switched amode? If no, we cannot do sie */
         if (s390_user_mode == HOME_SPACE_MODE)
@@@ -1069,57 -1166,16 +1163,16 @@@
         if (mm_has_pgste(tsk->mm))
                 return 0;
   
-       /* lets check if we are allowed to replace the mm */
-       task_lock(tsk);
-       if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
- #ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
- #endif
-           tsk->mm != tsk->active_mm) {
-               task_unlock(tsk);
-               return -EINVAL;
-       }
-       task_unlock(tsk);
- 
-       /* we copy the mm and let dup_mm create the page tables with_pgstes */
-       tsk->mm->context.alloc_pgste = 1;
-       /* make sure that both mms have a correct rss state */
-       sync_mm_rss(tsk->mm);
-       mm = dup_mm(tsk);
-       tsk->mm->context.alloc_pgste = 0;
-       if (!mm)
-               return -ENOMEM;
- 
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       down_write(&mm->mmap_sem);
         /* split thp mappings and disable thp for future mappings */
         thp_split_mm(mm);
-       mm->def_flags |= VM_NOHUGEPAGE;
- #endif
- 
-       /* Now lets check again if something happened */
-       task_lock(tsk);
-       if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
- #ifdef CONFIG_AIO
-           !hlist_empty(&tsk->mm->ioctx_list) ||
- #endif
-           tsk->mm != tsk->active_mm) {
-               mmput(mm);
-               task_unlock(tsk);
-               return -EINVAL;
-       }
- 
-       /* ok, we are alone. No ptrace, no threads, etc. */
-       old_mm = tsk->mm;
-       tsk->mm = tsk->active_mm = mm;
-       preempt_disable();
-       update_mm(mm, tsk);
-       atomic_inc(&mm->context.attach_count);
-       atomic_dec(&old_mm->context.attach_count);
-       cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-       preempt_enable();
-       task_unlock(tsk);
-       mmput(old_mm);
-       return 0;
+       /* Reallocate the page tables with pgstes */
+       mm->context.has_pgste = 1;
- -      tlb_gather_mmu(&tlb, mm, 0);
++      tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
+       page_table_realloc(&tlb, mm, 0, TASK_SIZE);
- -      tlb_finish_mmu(&tlb, 0, -1);
++      tlb_finish_mmu(&tlb, 0, TASK_SIZE);
+       up_write(&mm->mmap_sem);
+       return mm->context.has_pgste ? 0 : -ENOMEM;
   }
   EXPORT_SYMBOL_GPL(s390_enable_sie);
   
@@@ -1195,9 -1251,9 +1248,9 @@@ pgtable_t pgtable_trans_huge_withdraw(s
                 list_del(lh);
         }
         ptep = (pte_t *) pgtable;
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +      pte_val(*ptep) = _PAGE_INVALID;
         ptep++;
- -      pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+ +      pte_val(*ptep) = _PAGE_INVALID;
         return pgtable;
   }
   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --combined include/linux/sched.h

index f79ced7194355ef01f282d38759689e0b7ee3059,bfc809d51745ff3c57fcb610ecc92cf179156500..ce1e1c0aaa337fab9dda6de7f8f6d7529ca8a270
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -107,14 -107,6 +107,6 @@@ extern unsigned long this_cpu_load(void
   extern void calc_global_load(unsigned long ticks);
   extern void update_cpu_load_nohz(void);
   
- /* Notifier for when a task gets migrated to a new CPU */
- struct task_migration_notifier {
-       struct task_struct *task;
-       int from_cpu;
-       int to_cpu;
- };
- extern void register_task_migration_notifier(struct notifier_block *n);
- 
   extern unsigned long get_parent_ip(unsigned long addr);
   
   extern void dump_cpu_task(int cpu);
@@@ -1034,9 -1026,6 +1026,9 @@@ struct task_struct 
   #ifdef CONFIG_SMP
         struct llist_node wake_entry;
         int on_cpu;
+ +      struct task_struct *last_wakee;
+ +      unsigned long wakee_flips;
+ +      unsigned long wakee_flip_decay_ts;
   #endif
         int on_rq;
   
@@@ -1535,8 -1524,6 +1527,8 @@@ static inline pid_t task_pgrp_nr(struc
    * Test if a process is not yet dead (at most zombie state)
    * If pid_alive fails, then pointers within the task structure
    * can be stale and must not be dereferenced.
+ + *
+ + * Return: 1 if the process is alive. 0 otherwise.
    */
   static inline int pid_alive(struct task_struct *p)
   {
@@@ -1548,8 -1535,6 +1540,8 @@@
    * @tsk: Task structure to be checked.
    *
    * Check if a task structure is the first user space task the kernel created.
+ + *
+ + * Return: 1 if the task structure is init. 0 otherwise.
    */
   static inline int is_global_init(struct task_struct *tsk)
   {
@@@ -1635,7 -1620,6 +1627,7 @@@ extern void thread_group_cputime_adjust
   #define PF_MEMPOLICY  0x10000000      /* Non-default NUMA mempolicy */
   #define PF_MUTEX_TESTER       0x20000000      /* Thread belongs to the rt mutex tester */
   #define PF_FREEZER_SKIP       0x40000000      /* Freezer should not count it as freezable */
+ +#define PF_SUSPEND_TASK 0x80000000      /* this thread called freeze_processes and should not be frozen */
   
   /*
    * Only the _current_ task can read/write to tsk->flags, but other
@@@ -1901,8 -1885,6 +1893,8 @@@ extern struct task_struct *idle_task(in
   /**
    * is_idle_task - is the specified task an idle task?
    * @p: the task in question.
+ + *
+ + * Return: 1 if @p is an idle task. 0 otherwise.
    */
   static inline bool is_idle_task(const struct task_struct *p)
   {
diff --combined kernel/sched/core.c

index 725aa067ad63c4d1a08f7e16975e625ee19c8c09,0efd2eefb027bdb269f2a63629fba4bc2ef8ded3..5ac63c9a995a3570e0ad73a20b28c23cb972a963
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -933,8 -933,6 +933,8 @@@ static int effective_prio(struct task_s
   /**
    * task_curr - is this task currently executing on a CPU?
    * @p: the task in question.
+ + *
+ + * Return: 1 if the task is currently executing. 0 otherwise.
    */
   inline int task_curr(const struct task_struct *p)
   {
@@@ -978,13 -976,6 +978,6 @@@ void check_preempt_curr(struct rq *rq, 
                 rq->skip_clock_update = 1;
   }
   
- static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
- 
- void register_task_migration_notifier(struct notifier_block *n)
- {
-       atomic_notifier_chain_register(&task_migration_notifier, n);
- }
- 
   #ifdef CONFIG_SMP
   void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
   {
@@@ -1015,18 -1006,10 +1008,10 @@@
         trace_sched_migrate_task(p, new_cpu);
   
         if (task_cpu(p) != new_cpu) {
-               struct task_migration_notifier tmn;
- 
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
                 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
- 
-               tmn.task = p;
-               tmn.from_cpu = task_cpu(p);
-               tmn.to_cpu = new_cpu;
- 
-               atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
         }
   
         __set_task_cpu(p, new_cpu);
@@@ -1484,7 -1467,7 +1469,7 @@@ static void ttwu_queue(struct task_stru
    * the simpler "current->state = TASK_RUNNING" to mark yourself
    * runnable without the overhead of this.
    *
- - * Returns %true if @p was woken up, %false if it was already running
+ + * Return: %true if @p was woken up, %false if it was already running.
    * or @state didn't match @p's state.
    */
   static int
@@@ -1493,13 -1476,7 +1478,13 @@@ try_to_wake_up(struct task_struct *p, u
         unsigned long flags;
         int cpu, success = 0;
   
- -      smp_wmb();
+ +      /*
+ +       * If we are going to wake up a thread waiting for CONDITION we
+ +       * need to ensure that CONDITION=1 done by the caller can not be
+ +       * reordered with p->state check below. This pairs with mb() in
+ +       * set_current_state() the waiting thread does.
+ +       */
+ +      smp_mb__before_spinlock();
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         if (!(p->state & state))
                 goto out;
@@@ -1585,9 -1562,8 +1570,9 @@@ out
    * @p: The process to be woken up.
    *
    * Attempt to wake up the nominated process and move it to the set of runnable
- - * processes.  Returns 1 if the process was woken up, 0 if it was already
- - * running.
+ + * processes.
+ + *
+ + * Return: 1 if the process was woken up, 0 if it was already running.
    *
    * It may be assumed that this function implies a write memory barrier before
    * changing the task state if and only if any tasks are woken up.
@@@ -2200,8 -2176,6 +2185,8 @@@ void scheduler_tick(void
    * This makes sure that uptime, CFS vruntime, load
    * balancing, etc... continue to move forward, even
    * with a very low granularity.
+ + *
+ + * Return: Maximum deferment in nanoseconds.
    */
   u64 scheduler_tick_max_deferment(void)
   {
@@@ -2405,12 -2379,6 +2390,12 @@@ need_resched
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
   
+ +      /*
+ +       * Make sure that signal_pending_state()->signal_pending() below
+ +       * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ +       * done by the caller to avoid the race with signal_wake_up().
+ +       */
+ +      smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
   
         switch_count = &prev->nivcsw;
@@@ -2527,11 -2495,13 +2512,11 @@@ void __sched schedule_preempt_disabled(
    */
   asmlinkage void __sched notrace preempt_schedule(void)
   {
- -      struct thread_info *ti = current_thread_info();
- -
         /*
          * If there is a non-zero preempt_count or interrupts are disabled,
          * we do not want to preempt the current task. Just return..
          */
- -      if (likely(ti->preempt_count || irqs_disabled()))
+ +      if (likely(!preemptible()))
                 return;
   
         do {
@@@ -2675,7 -2645,7 +2660,7 @@@ void __wake_up_sync_key(wait_queue_head
         if (unlikely(!q))
                 return;
   
- -      if (unlikely(!nr_exclusive))
+ +      if (unlikely(nr_exclusive != 1))
                 wake_flags = 0;
   
         spin_lock_irqsave(&q->lock, flags);
@@@ -2811,8 -2781,8 +2796,8 @@@ EXPORT_SYMBOL(wait_for_completion)
    * specified timeout to expire. The timeout is in jiffies. It is not
    * interruptible.
    *
- - * The return value is 0 if timed out, and positive (at least 1, or number of
- - * jiffies left till timeout) if completed.
+ + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ + * till timeout) if completed.
    */
   unsigned long __sched
   wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@@ -2844,8 -2814,8 +2829,8 @@@ EXPORT_SYMBOL(wait_for_completion_io)
    * specified timeout to expire. The timeout is in jiffies. It is not
    * interruptible. The caller is accounted as waiting for IO.
    *
- - * The return value is 0 if timed out, and positive (at least 1, or number of
- - * jiffies left till timeout) if completed.
+ + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ + * till timeout) if completed.
    */
   unsigned long __sched
   wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@@ -2861,7 -2831,7 +2846,7 @@@ EXPORT_SYMBOL(wait_for_completion_io_ti
    * This waits for completion of a specific task to be signaled. It is
    * interruptible.
    *
- - * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ + * Return: -ERESTARTSYS if interrupted, 0 if completed.
    */
   int __sched wait_for_completion_interruptible(struct completion *x)
   {
@@@ -2880,8 -2850,8 +2865,8 @@@ EXPORT_SYMBOL(wait_for_completion_inter
    * This waits for either a completion of a specific task to be signaled or for a
    * specified timeout to expire. It is interruptible. The timeout is in jiffies.
    *
- - * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- - * positive (at least 1, or number of jiffies left till timeout) if completed.
+ + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ + * or number of jiffies left till timeout) if completed.
    */
   long __sched
   wait_for_completion_interruptible_timeout(struct completion *x,
@@@ -2898,7 -2868,7 +2883,7 @@@ EXPORT_SYMBOL(wait_for_completion_inter
    * This waits to be signaled for completion of a specific task. It can be
    * interrupted by a kill signal.
    *
- - * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ + * Return: -ERESTARTSYS if interrupted, 0 if completed.
    */
   int __sched wait_for_completion_killable(struct completion *x)
   {
@@@ -2918,8 -2888,8 +2903,8 @@@ EXPORT_SYMBOL(wait_for_completion_killa
    * signaled or for a specified timeout to expire. It can be
    * interrupted by a kill signal. The timeout is in jiffies.
    *
- - * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- - * positive (at least 1, or number of jiffies left till timeout) if completed.
+ + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ + * or number of jiffies left till timeout) if completed.
    */
   long __sched
   wait_for_completion_killable_timeout(struct completion *x,
@@@ -2933,7 -2903,7 +2918,7 @@@ EXPORT_SYMBOL(wait_for_completion_killa
    *    try_wait_for_completion - try to decrement a completion without blocking
    *    @x:     completion structure
    *
- - *    Returns: 0 if a decrement cannot be done without blocking
+ + *    Return: 0 if a decrement cannot be done without blocking
    *             1 if a decrement succeeded.
    *
    *    If a completion is being used as a counting completion,
@@@ -2960,7 -2930,7 +2945,7 @@@ EXPORT_SYMBOL(try_wait_for_completion)
    *    completion_done - Test to see if a completion has any waiters
    *    @x:     completion structure
    *
- - *    Returns: 0 if there are waiters (wait_for_completion() in progress)
+ + *    Return: 0 if there are waiters (wait_for_completion() in progress)
    *             1 if there are no waiters.
    *
    */
@@@ -3197,7 -3167,7 +3182,7 @@@ SYSCALL_DEFINE1(nice, int, increment
    * task_prio - return the priority value of a given task.
    * @p: the task in question.
    *
- - * This is the priority value as seen by users in /proc.
+ + * Return: The priority value as seen by users in /proc.
    * RT tasks are offset by -200. Normal tasks are centered
    * around 0, value goes from -16 to +15.
    */
@@@ -3209,8 -3179,6 +3194,8 @@@ int task_prio(const struct task_struct 
   /**
    * task_nice - return the nice value of a given task.
    * @p: the task in question.
+ + *
+ + * Return: The nice value [ -20 ... 0 ... 19 ].
    */
   int task_nice(const struct task_struct *p)
   {
@@@ -3221,8 -3189,6 +3206,8 @@@ EXPORT_SYMBOL(task_nice)
   /**
    * idle_cpu - is a given cpu idle currently?
    * @cpu: the processor in question.
+ + *
+ + * Return: 1 if the CPU is currently idle. 0 otherwise.
    */
   int idle_cpu(int cpu)
   {
@@@ -3245,8 -3211,6 +3230,8 @@@
   /**
    * idle_task - return the idle task for a given cpu.
    * @cpu: the processor in question.
+ + *
+ + * Return: The idle task for the cpu @cpu.
    */
   struct task_struct *idle_task(int cpu)
   {
@@@ -3256,8 -3220,6 +3241,8 @@@
   /**
    * find_process_by_pid - find a process with a matching PID value.
    * @pid: the pid in question.
+ + *
+ + * Return: The task of @pid, if found. %NULL otherwise.
    */
   static struct task_struct *find_process_by_pid(pid_t pid)
   {
@@@ -3455,8 -3417,6 +3440,8 @@@ recheck
    * @policy: new policy.
    * @param: structure containing the new RT priority.
    *
+ + * Return: 0 on success. An error code otherwise.
+ + *
    * NOTE that the task may be already dead.
    */
   int sched_setscheduler(struct task_struct *p, int policy,
@@@ -3476,8 -3436,6 +3461,8 @@@ EXPORT_SYMBOL_GPL(sched_setscheduler)
    * current context has permission.  For example, this is needed in
    * stop_machine(): we create temporary high priority worker threads,
    * but our caller might not have that capability.
+ + *
+ + * Return: 0 on success. An error code otherwise.
    */
   int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                                const struct sched_param *param)
@@@ -3512,8 -3470,6 +3497,8 @@@ do_sched_setscheduler(pid_t pid, int po
    * @pid: the pid in question.
    * @policy: new policy.
    * @param: structure containing the new RT priority.
+ + *
+ + * Return: 0 on success. An error code otherwise.
    */
   SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
                 struct sched_param __user *, param)
@@@ -3529,8 -3485,6 +3514,8 @@@
    * sys_sched_setparam - set/change the RT priority of a thread
    * @pid: the pid in question.
    * @param: structure containing the new RT priority.
+ + *
+ + * Return: 0 on success. An error code otherwise.
    */
   SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
   {
@@@ -3540,9 -3494,6 +3525,9 @@@
   /**
    * sys_sched_getscheduler - get the policy (scheduling class) of a thread
    * @pid: the pid in question.
+ + *
+ + * Return: On success, the policy of the thread. Otherwise, a negative error
+ + * code.
    */
   SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
   {
@@@ -3569,9 -3520,6 +3554,9 @@@
    * sys_sched_getparam - get the RT priority of a thread
    * @pid: the pid in question.
    * @param: structure containing the RT priority.
+ + *
+ + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ + * code.
    */
   SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
   {
@@@ -3696,8 -3644,6 +3681,8 @@@ static int get_user_cpu_mask(unsigned l
    * @pid: pid of the process
    * @len: length in bytes of the bitmask pointed to by user_mask_ptr
    * @user_mask_ptr: user-space pointer to the new cpu mask
+ + *
+ + * Return: 0 on success. An error code otherwise.
    */
   SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@@ -3749,8 -3695,6 +3734,8 @@@ out_unlock
    * @pid: pid of the process
    * @len: length in bytes of the bitmask pointed to by user_mask_ptr
    * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ + *
+ + * Return: 0 on success. An error code otherwise.
    */
   SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@@ -3785,8 -3729,6 +3770,8 @@@
    *
    * This function yields the current CPU to other tasks. If there are no
    * other threads running on this CPU then this function will return.
+ + *
+ + * Return: 0.
    */
   SYSCALL_DEFINE0(sched_yield)
   {
@@@ -3912,7 -3854,7 +3897,7 @@@ EXPORT_SYMBOL(yield)
    * It's the caller's job to ensure that the target task struct
    * can't go away on us before we can do any checks.
    *
- - * Returns:
+ + * Return:
    *    true (>0) if we indeed boosted the target task.
    *    false (0) if we failed to boost the target.
    *    -ESRCH if there's no task to yield to.
@@@ -4015,9 -3957,8 +4000,9 @@@ long __sched io_schedule_timeout(long t
    * sys_sched_get_priority_max - return maximum RT priority.
    * @policy: scheduling class.
    *
- - * this syscall returns the maximum rt_priority that can be used
- - * by a given scheduling class.
+ + * Return: On success, this syscall returns the maximum
+ + * rt_priority that can be used by a given scheduling class.
+ + * On failure, a negative error code is returned.
    */
   SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
   {
@@@ -4041,9 -3982,8 +4026,9 @@@
    * sys_sched_get_priority_min - return minimum RT priority.
    * @policy: scheduling class.
    *
- - * this syscall returns the minimum rt_priority that can be used
- - * by a given scheduling class.
+ + * Return: On success, this syscall returns the minimum
+ + * rt_priority that can be used by a given scheduling class.
+ + * On failure, a negative error code is returned.
    */
   SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
   {
@@@ -4069,9 -4009,6 +4054,9 @@@
    *
    * this syscall writes the default timeslice value of a given process
    * into the user-space timespec buffer. A value of '0' means infinity.
+ + *
+ + * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ + * an error code.
    */
   SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                 struct timespec __user *, interval)
@@@ -4181,7 -4118,7 +4166,7 @@@ void show_state_filter(unsigned long st
                 debug_show_all_locks();
   }
   
- -void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+ +void init_idle_bootup_task(struct task_struct *idle)
   {
         idle->sched_class = &idle_sched_class;
   }
@@@ -4194,7 -4131,7 +4179,7 @@@
    * NOTE: this function does not set the idle thread's NEED_RESCHED
    * flag, to make booting more robust.
    */
- -void __cpuinit init_idle(struct task_struct *idle, int cpu)
+ +void init_idle(struct task_struct *idle, int cpu)
   {
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
@@@ -4678,7 -4615,7 +4663,7 @@@ static void set_rq_offline(struct rq *r
    * migration_call - callback that gets triggered when a CPU is added.
    * Here we can start up the necessary migration thread for the new CPU.
    */
- -static int __cpuinit
+ +static int
   migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
   {
         int cpu = (long)hcpu;
@@@ -4732,12 -4669,12 +4717,12 @@@
    * happens before everything else.  This has to be lower priority than
    * the notifier in the perf_event subsystem, though.
    */
- -static struct notifier_block __cpuinitdata migration_notifier = {
+ +static struct notifier_block migration_notifier = {
         .notifier_call = migration_call,
         .priority = CPU_PRI_MIGRATION,
   };
   
- -static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ +static int sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
   {
         switch (action & ~CPU_TASKS_FROZEN) {
@@@ -4750,7 -4687,7 +4735,7 @@@
         }
   }
   
- -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ +static int sched_cpu_inactive(struct notifier_block *nfb,
                                         unsigned long action, void *hcpu)
   {
         switch (action & ~CPU_TASKS_FROZEN) {
@@@ -4962,8 -4899,7 +4947,8 @@@ sd_parent_degenerate(struct sched_domai
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
                                 SD_SHARE_CPUPOWER |
- -                              SD_SHARE_PKG_RESOURCES);
+ +                              SD_SHARE_PKG_RESOURCES |
+ +                              SD_PREFER_SIBLING);
                 if (nr_node_ids == 1)
                         pflags &= ~SD_SERIALIZE;
         }
@@@ -5132,23 -5068,18 +5117,23 @@@ static void destroy_sched_domains(struc
    * two cpus are in the same cache domain, see cpus_share_cache().
    */
   DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+ +DEFINE_PER_CPU(int, sd_llc_size);
   DEFINE_PER_CPU(int, sd_llc_id);
   
   static void update_top_cache_domain(int cpu)
   {
         struct sched_domain *sd;
         int id = cpu;
+ +      int size = 1;
   
         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- -      if (sd)
+ +      if (sd) {
                 id = cpumask_first(sched_domain_span(sd));
+ +              size = cpumask_weight(sched_domain_span(sd));
+ +      }
   
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ +      per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
   }
   
@@@ -5172,13 -5103,6 +5157,13 @@@ cpu_attach_domain(struct sched_domain *
                         tmp->parent = parent->parent;
                         if (parent->parent)
                                 parent->parent->child = tmp;
+ +                      /*
+ +                       * Transfer SD_PREFER_SIBLING down in case of a
+ +                       * degenerate parent; the spans match for this
+ +                       * so the property transfers.
+ +                       */
+ +                      if (parent->flags & SD_PREFER_SIBLING)
+ +                              tmp->flags |= SD_PREFER_SIBLING;
                         destroy_sched_domain(parent, cpu);
                 } else
                         tmp = tmp->parent;
@@@ -6245,9 -6169,8 +6230,9 @@@ match1
                 ;
         }
   
+ +      n = ndoms_cur;
         if (doms_new == NULL) {
- -              ndoms_cur = 0;
+ +              n = 0;
                 doms_new = &fallback_doms;
                 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                 WARN_ON_ONCE(dattr_new);
@@@ -6255,7 -6178,7 +6240,7 @@@
   
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
- -              for (j = 0; j < ndoms_cur && !new_topology; j++) {
+ +              for (j = 0; j < n && !new_topology; j++) {
                         if (cpumask_equal(doms_new[i], doms_cur[j])
                             && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
@@@ -6694,8 -6617,6 +6679,8 @@@ void normalize_rt_tasks(void
    * @cpu: the processor in question.
    *
    * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ + *
+ + * Return: The current task for @cpu.
    */
   struct task_struct *curr_task(int cpu)
   {
@@@ -6827,7 -6748,7 +6812,7 @@@ void sched_move_task(struct task_struc
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
   
- -      tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ +      tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
                                 lockdep_is_held(&tsk->sighand->siglock)),
                           struct task_group, css);
         tg = autogroup_task_group(tsk, tg);
@@@ -7149,22 -7070,23 +7134,22 @@@ int sched_rt_handler(struct ctl_table *
   
   #ifdef CONFIG_CGROUP_SCHED
   
- -/* return corresponding task_group object of a cgroup */
- -static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+ +static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
   {
- -      return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- -                          struct task_group, css);
+ +      return css ? container_of(css, struct task_group, css) : NULL;
   }
   
- -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+ +static struct cgroup_subsys_state *
+ +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
   {
- -      struct task_group *tg, *parent;
+ +      struct task_group *parent = css_tg(parent_css);
+ +      struct task_group *tg;
   
- -      if (!cgrp->parent) {
+ +      if (!parent) {
                 /* This is early initialization for the top cgroup */
                 return &root_task_group.css;
         }
   
- -      parent = cgroup_tg(cgrp->parent);
         tg = sched_create_group(parent);
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
@@@ -7172,38 -7094,41 +7157,38 @@@
         return &tg->css;
   }
   
- -static int cpu_cgroup_css_online(struct cgroup *cgrp)
+ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
   {
- -      struct task_group *tg = cgroup_tg(cgrp);
- -      struct task_group *parent;
+ +      struct task_group *tg = css_tg(css);
+ +      struct task_group *parent = css_tg(css_parent(css));
   
- -      if (!cgrp->parent)
- -              return 0;
- -
- -      parent = cgroup_tg(cgrp->parent);
- -      sched_online_group(tg, parent);
+ +      if (parent)
+ +              sched_online_group(tg, parent);
         return 0;
   }
   
- -static void cpu_cgroup_css_free(struct cgroup *cgrp)
+ +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
   {
- -      struct task_group *tg = cgroup_tg(cgrp);
+ +      struct task_group *tg = css_tg(css);
   
         sched_destroy_group(tg);
   }
   
- -static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+ +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
   {
- -      struct task_group *tg = cgroup_tg(cgrp);
+ +      struct task_group *tg = css_tg(css);
   
         sched_offline_group(tg);
   }
   
- -static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+ +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
                                  struct cgroup_taskset *tset)
   {
         struct task_struct *task;
   
- -      cgroup_taskset_for_each(task, cgrp, tset) {
+ +      cgroup_taskset_for_each(task, css, tset) {
   #ifdef CONFIG_RT_GROUP_SCHED
- -              if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ +              if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
   #else
                 /* We don't support RT-tasks being in separate groups */
@@@ -7214,18 -7139,18 +7199,18 @@@
         return 0;
   }
   
- -static void cpu_cgroup_attach(struct cgroup *cgrp,
+ +static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
                               struct cgroup_taskset *tset)
   {
         struct task_struct *task;
   
- -      cgroup_taskset_for_each(task, cgrp, tset)
+ +      cgroup_taskset_for_each(task, css, tset)
                 sched_move_task(task);
   }
   
- -static void
- -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
- -              struct task_struct *task)
+ +static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ +                          struct cgroup_subsys_state *old_css,
+ +                          struct task_struct *task)
   {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
@@@ -7239,16 -7164,15 +7224,16 @@@
   }
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- -                              u64 shareval)
+ +static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ +                              struct cftype *cftype, u64 shareval)
   {
- -      return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ +      return sched_group_set_shares(css_tg(css), scale_load(shareval));
   }
   
- -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+ +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ +                             struct cftype *cft)
   {
- -      struct task_group *tg = cgroup_tg(cgrp);
+ +      struct task_group *tg = css_tg(css);
   
         return (u64) scale_load_down(tg->shares);
   }
@@@ -7370,28 -7294,26 +7355,28 @@@ long tg_get_cfs_period(struct task_grou
         return cfs_period_us;
   }
   
- -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+ +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ +                                struct cftype *cft)
   {
- -      return tg_get_cfs_quota(cgroup_tg(cgrp));
+ +      return tg_get_cfs_quota(css_tg(css));
   }
   
- -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- -                              s64 cfs_quota_us)
+ +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ +                                 struct cftype *cftype, s64 cfs_quota_us)
   {
- -      return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ +      return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
   }
   
- -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+ +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ +                                 struct cftype *cft)
   {
- -      return tg_get_cfs_period(cgroup_tg(cgrp));
+ +      return tg_get_cfs_period(css_tg(css));
   }
   
- -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- -                              u64 cfs_period_us)
+ +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ +                                  struct cftype *cftype, u64 cfs_period_us)
   {
- -      return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ +      return tg_set_cfs_period(css_tg(css), cfs_period_us);
   }
   
   struct cfs_schedulable_data {
@@@ -7472,10 -7394,10 +7457,10 @@@ static int __cfs_schedulable(struct tas
         return ret;
   }
   
- -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
                 struct cgroup_map_cb *cb)
   {
- -      struct task_group *tg = cgroup_tg(cgrp);
+ +      struct task_group *tg = css_tg(css);
         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   
         cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@@ -7488,28 -7410,26 +7473,28 @@@
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   
   #ifdef CONFIG_RT_GROUP_SCHED
- -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- -                              s64 val)
+ +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ +                              struct cftype *cft, s64 val)
   {
- -      return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ +      return sched_group_set_rt_runtime(css_tg(css), val);
   }
   
- -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+ +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ +                             struct cftype *cft)
   {
- -      return sched_group_rt_runtime(cgroup_tg(cgrp));
+ +      return sched_group_rt_runtime(css_tg(css));
   }
   
- -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- -              u64 rt_period_us)
+ +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ +                                  struct cftype *cftype, u64 rt_period_us)
   {
- -      return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ +      return sched_group_set_rt_period(css_tg(css), rt_period_us);
   }
   
- -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+ +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ +                                 struct cftype *cft)
   {
- -      return sched_group_rt_period(cgroup_tg(cgrp));
+ +      return sched_group_rt_period(css_tg(css));
   }
   #endif /* CONFIG_RT_GROUP_SCHED */
author	Linus Torvalds <[email protected]>
	Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
committer	Linus Torvalds <[email protected]>
	Thu, 5 Sep 2013 01:15:06 +0000 (18:15 -0700)
		1	2
arch/arm/configs/keystone_defconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/configs/omap2plus_defconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/configs/tegra_defconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/asm-offsets.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/include/asm/mmu_context.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/include/asm/pgtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/include/asm/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/kvm-s390.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/priv.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/mm/pgtable.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history