Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
Pull KVM updates from Marcelo Tosatti:
 "KVM updates for the 3.9 merge window, including x86 real mode
  emulation fixes, stronger memory slot interface restrictions, mmu_lock
  spinlock hold time reduction, improved handling of large page faults
  on shadow, initial APICv HW acceleration support, s390 channel IO
  based virtio, amongst others"

* tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  Revert "KVM: MMU: lazily drop large spte"
  x86: pvclock kvm: align allocation size to page size
  KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr
  x86 emulator: fix parity calculation for AAD instruction
  KVM: PPC: BookE: Handle alignment interrupts
  booke: Added DBCR4 SPR number
  KVM: PPC: booke: Allow multiple exception types
  KVM: PPC: booke: use vcpu reference from thread_struct
  KVM: Remove user_alloc from struct kvm_memory_slot
  KVM: VMX: disable apicv by default
  KVM: s390: Fix handling of iscs.
  KVM: MMU: cleanup __direct_map
  KVM: MMU: remove pt_access in mmu_set_spte
  KVM: MMU: cleanup mapping-level
  KVM: MMU: lazily drop large spte
  KVM: VMX: cleanup vmx_set_cr0().
  KVM: VMX: add missing exit names to VMX_EXIT_REASONS array
  KVM: VMX: disable SMEP feature when guest is in non-paging mode
  KVM: Remove duplicate text in api.txt
  Revert "KVM: MMU: split kvm_mmu_free_page"
  ...

20 files changed:
Documentation/virtual/kvm/api.txt
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/emulate.c
arch/s390/include/asm/irq.h
arch/s390/kernel/irq.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/kvmclock.c
arch/x86/kvm/x86.c
drivers/s390/kvm/kvm_virtio.c
drivers/s390/kvm/virtio_ccw.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
kernel/sched/core.c

index e0fa0ea2b1870f17291acc9890c47150f1e092f5,c2534c300a45489bc3d3cd2ecc8b984553f91fe4..119358dfb74295af7e95c6ecdcda35e9cf1d8f17
@@@ -219,19 -219,6 +219,6 @@@ allocation of vcpu ids.  For example, i
  single-threaded guest vcpus, it should make all vcpu ids be a multiple
  of the number of vcpus per vcore.
  
- On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
- threads in one or more virtual CPU cores.  (This is because the
- hardware requires all the hardware threads in a CPU core to be in the
- same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
- of vcpus per virtual core (vcore).  The vcore id is obtained by
- dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
- given vcore will always be in the same physical core as each other
- (though that might be a different physical core from time to time).
- Userspace can control the threading (SMT) mode of the guest by its
- allocation of vcpu ids.  For example, if userspace wants
- single-threaded guest vcpus, it should make all vcpu ids be a multiple
- of the number of vcpus per vcore.
  For virtual cpus that have been created with S390 user controlled virtual
  machines, the resulting vcpu fd can be memory mapped at page offset
  KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
@@@ -293,7 -280,7 +280,7 @@@ kvm_run' (see below)
  4.11 KVM_GET_REGS
  
  Capability: basic
 -Architectures: all
 +Architectures: all except ARM
  Type: vcpu ioctl
  Parameters: struct kvm_regs (out)
  Returns: 0 on success, -1 on error
@@@ -314,7 -301,7 +301,7 @@@ struct kvm_regs 
  4.12 KVM_SET_REGS
  
  Capability: basic
 -Architectures: all
 +Architectures: all except ARM
  Type: vcpu ioctl
  Parameters: struct kvm_regs (in)
  Returns: 0 on success, -1 on error
@@@ -345,7 -332,7 +332,7 @@@ struct kvm_sregs 
        __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
  };
  
- /* ppc -- see arch/powerpc/include/asm/kvm.h */
+ /* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */
  
  interrupt_bitmap is a bitmap of pending external interrupts.  At most
  one bit may be set.  This interrupt has been acknowledged by the APIC
@@@ -600,7 -587,7 +587,7 @@@ struct kvm_fpu 
  4.24 KVM_CREATE_IRQCHIP
  
  Capability: KVM_CAP_IRQCHIP
 -Architectures: x86, ia64
 +Architectures: x86, ia64, ARM
  Type: vm ioctl
  Parameters: none
  Returns: 0 on success, -1 on error
  Creates an interrupt controller model in the kernel.  On x86, creates a virtual
  ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
  local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
 -only go to the IOAPIC.  On ia64, a IOSAPIC is created.
 +only go to the IOAPIC.  On ia64, an IOSAPIC is created.  On ARM, a GIC is
 +created.
  
  
  4.25 KVM_IRQ_LINE
  
  Capability: KVM_CAP_IRQCHIP
 -Architectures: x86, ia64
 +Architectures: x86, ia64, arm
  Type: vm ioctl
  Parameters: struct kvm_irq_level
  Returns: 0 on success, -1 on error
  
  Sets the level of a GSI input to the interrupt controller model in the kernel.
 -Requires that an interrupt controller model has been previously created with
 -KVM_CREATE_IRQCHIP.  Note that edge-triggered interrupts require the level
 -to be set to 1 and then back to 0.
 +On some architectures it is required that an interrupt controller model has
 +been previously created with KVM_CREATE_IRQCHIP.  Note that edge-triggered
 +interrupts require the level to be set to 1 and then back to 0.
 +
 +ARM can signal an interrupt either at the CPU level or at the in-kernel irqchip
 +(GIC), and, when using the in-kernel irqchip, can tell the GIC to use PPIs
 +designated for specific cpus.  The irq field is interpreted like this:
 +
 +  bits:  | 31 ... 24 | 23  ... 16 | 15    ...    0 |
 +  field: | irq_type  | vcpu_index |     irq_id     |
 +
 +The irq_type field has the following values:
 +- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
 +- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
 +               (the vcpu_index field is ignored)
 +- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
 +
 +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
 +
 +In both cases, level is used to raise/lower the line.
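[Editor's note] As a hedged illustration (not part of the patch): packing the irq field for KVM_IRQ_LINE from userspace. The helper name and `vm_fd` are assumptions; the shift values are derived from the bit layout above.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Compose irq_type/vcpu_index/irq_id per the layout above and set the line. */
    static int arm_irq_line(int vm_fd, unsigned int irq_type,
                            unsigned int vcpu_index, unsigned int irq_id,
                            int level)
    {
            struct kvm_irq_level irq_level = {
                    .irq   = (irq_type << 24) | (vcpu_index << 16) | irq_id,
                    .level = level,
            };

            return ioctl(vm_fd, KVM_IRQ_LINE, &irq_level);
    }

    /* e.g. raise SPI 32 on the in-kernel GIC: arm_irq_line(vm_fd, 1, 0, 32, 1); */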
  
  struct kvm_irq_level {
        union {
@@@ -892,12 -861,12 +879,12 @@@ It is recommended that the lower 21 bit
  be identical.  This allows large pages in the guest to be backed by large
  pages in the host.
  
- The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs
- kvm to keep track of writes to memory within the slot.  See KVM_GET_DIRTY_LOG
- ioctl.  The KVM_CAP_READONLY_MEM capability indicates the availability of the
- KVM_MEM_READONLY flag.  When this flag is set for a memory region, KVM only
- allows read accesses.  Writes will be posted to userspace as KVM_EXIT_MMIO
- exits.
+ The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
+ KVM_MEM_READONLY.  The former can be set to instruct KVM to keep track of
+ writes to memory within the slot.  See KVM_GET_DIRTY_LOG ioctl to know how to
+ use it.  The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
+ to make a new slot read-only.  In this case, writes to this memory will be
+ posted to userspace as KVM_EXIT_MMIO exits.
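[Editor's note] A minimal sketch of setting these flags, assuming an open VM fd `vm_fd` and KVM_CAP_READONLY_MEM; the slot number, guest address, and backing buffer are illustrative (includes and error handling trimmed).

    void *backing = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    struct kvm_userspace_memory_region region = {
            .slot            = 0,                  /* illustrative slot number */
            .flags           = KVM_MEM_READONLY,   /* needs KVM_CAP_READONLY_MEM */
            .guest_phys_addr = 0x100000,
            .memory_size     = 0x200000,
            .userspace_addr  = (__u64)(unsigned long)backing,
    };

    /* Guest writes to this slot will now surface as KVM_EXIT_MMIO exits. */
    ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);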
  
  When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
  the memory region are automatically reflected into the guest.  For example, an
@@@ -931,7 -900,7 +918,7 @@@ documentation when it pops into existen
  4.37 KVM_ENABLE_CAP
  
  Capability: KVM_CAP_ENABLE_CAP
- Architectures: ppc
+ Architectures: ppc, s390
  Type: vcpu ioctl
  Parameters: struct kvm_enable_cap (in)
  Returns: 0 on success; -1 on error
@@@ -1792,28 -1761,8 +1779,29 @@@ registers, find a list below
    PPC   | KVM_REG_PPC_VPA_SLB   | 128
    PPC   | KVM_REG_PPC_VPA_DTL   | 128
    PPC   | KVM_REG_PPC_EPCR    | 32
+   PPC   | KVM_REG_PPC_EPR     | 32
  
 +ARM registers are mapped using the lower 32 bits.  The upper 16 bits of
 +that space hold the register group type, or coprocessor number:
 +
 +ARM core registers have the following id bit patterns:
 +  0x4020 0000 0010 <index into the kvm_regs struct:16>
 +
 +ARM 32-bit CP15 registers have the following id bit patterns:
 +  0x4020 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3>
 +
 +ARM 64-bit CP15 registers have the following id bit patterns:
 +  0x4030 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3>
 +
 +ARM CCSIDR registers are demultiplexed by CSSELR value:
 +  0x4020 0000 0011 00 <csselr:8>
 +
 +ARM 32-bit VFP control registers have the following id bit patterns:
 +  0x4020 0000 0012 1 <regno:12>
 +
 +ARM 64-bit FP registers have the following id bit patterns:
 +  0x4030 0000 0012 0 <regno:12>
 +
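[Editor's note] A hedged sketch of composing one of these ids from the symbolic uapi constants (which encode the same bit patterns) and reading the register back with KVM_GET_ONE_REG, described next; `vcpu_fd` is assumed.

    #include <stddef.h>
    #include <linux/kvm.h>  /* pulls in the ARM uapi constants on an ARM build */

    /* ARM core register: group 0x0010, 32-bit size, index = offset / 4. */
    __u32 pc;
    struct kvm_one_reg reg = {
            .id   = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE |
                    (offsetof(struct kvm_regs, usr_regs.uregs[15]) / 4), /* pc */
            .addr = (__u64)(unsigned long)&pc,
    };
    ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);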
  4.69 KVM_GET_ONE_REG
  
  Capability: KVM_CAP_ONE_REG
@@@ -2108,6 -2057,14 +2096,14 @@@ KVM_S390_INT_VIRTIO (vm) - virtio exter
  KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
  KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
  KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
+ KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an
+     I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel);
+     I/O interruption parameters in parm (subchannel) and parm64 (intparm,
+     interruption subclass)
+ KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm,
+                            machine check interrupt code in parm64 (note that
+                            machine checks needing further payload are not
+                            supported by this ioctl)
  
  Note that the vcpu ioctl is asynchronous to vcpu execution.
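[Editor's note] As a hedged sketch (parameter values illustrative, `vm_fd` assumed): injecting a floating adapter I/O interrupt with the KVM_S390_INTERRUPT vm ioctl, using the KVM_S390_INT_IO macro from the uapi header to build the compound type value.

    struct kvm_s390_interrupt s390int = {
            .type   = KVM_S390_INT_IO(1, 0, 0, 0),  /* ai = 1: adapter interrupt */
            .parm   = 0,                            /* subchannel id/nr unused here */
            .parm64 = ((__u64)0x12345678 << 32)     /* intparm */
                      | 0x80000000u,                /* illustrative int word (isc bits) */
    };
    ioctl(vm_fd, KVM_S390_INTERRUPT, &s390int);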
  
@@@ -2166,88 -2123,6 +2162,88 @@@ written, then `n_invalid' invalid entri
  valid entries found.
  
  
 +4.77 KVM_ARM_VCPU_INIT
 +
 +Capability: basic
 +Architectures: arm
 +Type: vcpu ioctl
 +Parameters: struct kvm_vcpu_init (in)
 +Returns: 0 on success; -1 on error
 +Errors:
 +  EINVAL:    the target is unknown, or the combination of features is invalid.
 +  ENOENT:    a features bit specified is unknown.
 +
 +This tells KVM what type of CPU to present to the guest, and what
 +optional features it should have.  This will cause a reset of the cpu
 +registers to their initial values.  If this is not called, KVM_RUN will
 +return ENOEXEC for that vcpu.
 +
 +Note that because some registers reflect machine topology, all vcpus
 +should be created before this ioctl is invoked.
 +
 +Possible features:
 +      - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
 +        Depends on KVM_CAP_ARM_PSCI.
 +
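[Editor's note] A minimal sketch of the call, assuming an open `vcpu_fd` on a Cortex-A15 host; the feature bit manipulation mirrors the features bitmap layout (includes trimmed).

    struct kvm_vcpu_init init = {
            .target = KVM_ARM_TARGET_CORTEX_A15,
    };
    /* optionally start powered off (needs KVM_CAP_ARM_PSCI): */
    init.features[KVM_ARM_VCPU_POWER_OFF / 32] |=
            1u << (KVM_ARM_VCPU_POWER_OFF % 32);

    ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);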
 +
 +4.78 KVM_GET_REG_LIST
 +
 +Capability: basic
 +Architectures: arm
 +Type: vcpu ioctl
 +Parameters: struct kvm_reg_list (in/out)
 +Returns: 0 on success; -1 on error
 +Errors:
 +  E2BIG:     the reg index list is too big to fit in the array specified by
 +             the user (the number required will be written into n).
 +
 +struct kvm_reg_list {
 +      __u64 n; /* number of registers in reg[] */
 +      __u64 reg[0];
 +};
 +
 +This ioctl returns the guest registers that are supported for the
 +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
 +
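[Editor's note] The usual two-step pattern implied by the E2BIG behavior, sketched with includes and error handling trimmed; `vcpu_fd` is assumed.

    /* First call with n = 0 fails with E2BIG but writes the required count. */
    struct kvm_reg_list probe = { .n = 0 };
    ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

    struct kvm_reg_list *list = malloc(sizeof(*list) + probe.n * sizeof(__u64));
    list->n = probe.n;
    ioctl(vcpu_fd, KVM_GET_REG_LIST, list);  /* list->reg[] now holds the ids */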
 +
 +4.80 KVM_ARM_SET_DEVICE_ADDR
 +
 +Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
 +Architectures: arm
 +Type: vm ioctl
 +Parameters: struct kvm_arm_device_addr (in)
 +Returns: 0 on success, -1 on error
 +Errors:
 +  ENODEV: The device id is unknown
 +  ENXIO:  Device not supported on current system
 +  EEXIST: Address already set
 +  E2BIG:  Address outside guest physical address space
 +  EBUSY:  Address overlaps with other device range
 +
 +struct kvm_arm_device_addr {
 +      __u64 id;
 +      __u64 addr;
 +};
 +
 +Specify a device address in the guest's physical address space where guests
 +can access emulated or directly exposed devices, which the host kernel needs
 +to know about. The id field is an architecture specific identifier for a
 +specific device.
 +
 +ARM divides the id field into two parts, a device id and an address type id
 +specific to the individual device.
 +
 +  bits:  | 63        ...       32 | 31    ...    16 | 15    ...    0 |
 +  field: |        0x00000000      |     device id   |  addr type id  |
 +
 +ARM currently only requires this when using the in-kernel GIC support for the
 +hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 as the device id.  When
 +setting the base address for the guest's mapping of the VGIC virtual CPU
 +and distributor interface, the ioctl must be called after calling
 +KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs.  Calling
 +this ioctl twice for any of the base addresses will return -EEXIST.
 +
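[Editor's note] A hedged sketch of placing the VGIC distributor (base address illustrative, `vm_fd` assumed; KVM_CREATE_IRQCHIP must already have been called).

    struct kvm_arm_device_addr dist = {
            .id   = ((__u64)KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT)
                    | KVM_VGIC_V2_ADDR_TYPE_DIST,
            .addr = 0x2c001000,                    /* illustrative guest address */
    };
    ioctl(vm_fd, KVM_ARM_SET_DEVICE_ADDR, &dist);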
 +
  5. The kvm_run structure
  ------------------------
  
@@@ -2359,8 -2234,8 +2355,8 @@@ executed a memory-mapped I/O instructio
  by kvm.  The 'data' member contains the written data if 'is_write' is
  true, and should be filled by application code otherwise.
  
- NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
-       and KVM_EXIT_PAPR the corresponding
+ NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
+       KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
  operations are complete (and guest state is consistent) only after userspace
  has re-entered the kernel with KVM_RUN.  The kernel side will first finish
  incomplete operations and then check for pending signals.  Userspace
@@@ -2463,6 -2338,41 +2459,41 @@@ The possible hypercalls are defined in 
  Requirements (PAPR) document available from www.power.org (free
  developer registration required to access it).
  
+               /* KVM_EXIT_S390_TSCH */
+               struct {
+                       __u16 subchannel_id;
+                       __u16 subchannel_nr;
+                       __u32 io_int_parm;
+                       __u32 io_int_word;
+                       __u32 ipb;
+                       __u8 dequeued;
+               } s390_tsch;
+ 
+ s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled
+ and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O
+ interrupt for the target subchannel has been dequeued and subchannel_id,
+ subchannel_nr, io_int_parm and io_int_word contain the parameters for that
+ interrupt. ipb is needed for instruction parameter decoding.
+ 
+               /* KVM_EXIT_EPR */
+               struct {
+                       __u32 epr;
+               } epr;
+ 
+ On FSL BookE PowerPC chips, the interrupt controller has a fast
+ interrupt acknowledge path to the core. When the core successfully
+ delivers an interrupt, it automatically populates the EPR register with
+ the interrupt vector number and acknowledges the interrupt inside
+ the interrupt controller.
+ 
+ In case the interrupt controller lives in user space, we need to do
+ the interrupt acknowledge cycle through it to fetch the next interrupt
+ vector to be delivered, using this exit.
+ 
+ It gets triggered whenever KVM_CAP_PPC_EPR is enabled and an external
+ interrupt has just been delivered into the guest. User space should
+ put the acknowledged interrupt vector into the 'epr' field.
                /* Fix the size of the union. */
                char padding[256];
        };
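[Editor's note] A hedged sketch of the userspace side of the KVM_EXIT_EPR exit described above; `run` is the mmap'ed kvm_run structure and next_pending_vector() is a hypothetical helper for the userspace interrupt controller model.

    if (run->exit_reason == KVM_EXIT_EPR) {
            run->epr.epr = next_pending_vector();
            /* the acknowledge cycle completes on the next KVM_RUN */
    }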
@@@ -2584,3 -2494,34 +2615,34 @@@ For mmu types KVM_MMU_FSL_BOOKE_NOHV an
     where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value.
   - The tsize field of mas1 shall be set to 4K on TLB0, even though the
     hardware ignores this value for TLB0.
+ 
+ 6.4 KVM_CAP_S390_CSS_SUPPORT
+ 
+ Architectures: s390
+ Parameters: none
+ Returns: 0 on success; -1 on error
+ 
+ This capability enables support for handling of channel I/O instructions.
+ 
+ TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are
+ handled in-kernel, while the other I/O instructions are passed to userspace.
+ 
+ When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST
+ SUBCHANNEL intercepts.
+ 
+ 6.5 KVM_CAP_PPC_EPR
+ 
+ Architectures: ppc
+ Parameters: args[0] defines whether the proxy facility is active
+ Returns: 0 on success; -1 on error
+ 
+ This capability enables or disables the delivery of interrupts through the
+ external proxy facility.
+ 
+ When enabled (args[0] != 0), every time the guest gets an external interrupt
+ delivered, it automatically exits into user space with a KVM_EXIT_EPR exit
+ to receive the topmost interrupt vector.
+ 
+ When disabled (args[0] == 0), behavior is as if this facility is unsupported.
+ 
+ When this capability is enabled, KVM_EXIT_EPR can occur.
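[Editor's note] A minimal sketch of flipping this capability on for a vcpu, assuming an open `vcpu_fd`.

    struct kvm_enable_cap cap = {
            .cap     = KVM_CAP_PPC_EPR,
            .args[0] = 1,                  /* non-zero: proxy facility active */
    };
    ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);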
index 03d7beae89a0ebb7fd0e66187db5c643d51b00f3,8a72d59467ebedd47356301813b79c887c5de6ee..d1bb86074721cf5394da11aca171fee1981bd72b
  
  #define KVM_MAX_VCPUS         NR_CPUS
  #define KVM_MAX_VCORES                NR_CPUS
- #define KVM_MEMORY_SLOTS 32
- /* memory slots that does not exposed to userspace */
- #define KVM_PRIVATE_MEM_SLOTS 4
- #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ #define KVM_USER_MEM_SLOTS 32
+ #define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
  
  #ifdef CONFIG_KVM_MMIO
  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@@ -440,7 -438,6 +438,7 @@@ struct kvm_vcpu_arch 
        ulong uamor;
        u32 ctrl;
        ulong dabr;
 +      ulong cfar;
  #endif
        u32 vrsave; /* also USPRG0 */
        u32 mmucr;
        u8 sane;
        u8 cpu_type;
        u8 hcall_needed;
+       u8 epr_enabled;
+       u8 epr_needed;
  
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
  
index 7035e608f3fa153fe248606a320af14fe0bed6dd,11ae3d8ba3a2a95fb4672a6e5a18b04b145d4539..e6658612203010aec0d3ba6ebb1f9c646e639575
  #define MSR_SF_LG     63              /* Enable 64 bit mode */
  #define MSR_ISF_LG    61              /* Interrupt 64b mode valid on 630 */
  #define MSR_HV_LG     60              /* Hypervisor state */
 +#define MSR_TS_T_LG   34              /* Trans Mem state: Transactional */
 +#define MSR_TS_S_LG   33              /* Trans Mem state: Suspended */
 +#define MSR_TS_LG     33              /* Trans Mem state (2 bits) */
 +#define MSR_TM_LG     32              /* Trans Mem Available */
  #define MSR_VEC_LG    25              /* Enable AltiVec */
  #define MSR_VSX_LG    23              /* Enable VSX */
  #define MSR_POW_LG    18              /* Enable Power Management */
  #define MSR_RI                __MASK(MSR_RI_LG)       /* Recoverable Exception */
  #define MSR_LE                __MASK(MSR_LE_LG)       /* Little Endian */
  
 +#define MSR_TM                __MASK(MSR_TM_LG)       /* Transactional Mem Available */
 +#define MSR_TS_N      0                       /*  Non-transactional */
 +#define MSR_TS_S      __MASK(MSR_TS_S_LG)     /*  Transaction Suspended */
 +#define MSR_TS_T      __MASK(MSR_TS_T_LG)     /*  Transaction Transactional */
 +#define MSR_TS_MASK   (MSR_TS_T | MSR_TS_S)   /* Transaction State bits */
 +#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */
 +#define MSR_TM_TRANSACTIONAL(x)       (((x) & MSR_TS_MASK) == MSR_TS_T)
 +#define MSR_TM_SUSPENDED(x)   (((x) & MSR_TS_MASK) == MSR_TS_S)
 +
 +/* Reason codes describing kernel causes for transaction aborts.  By
 +   convention, bit0 is copied to TEXASR[56] (IBM bit 7) which is set if
 +   the failure is persistent.
 +*/
 +#define TM_CAUSE_RESCHED      0xfe
 +#define TM_CAUSE_TLBI         0xfc
 +#define TM_CAUSE_FAC_UNAV     0xfa
 +#define TM_CAUSE_SYSCALL      0xf9 /* Persistent */
 +#define TM_CAUSE_MISC         0xf6
 +#define TM_CAUSE_SIGNAL               0xf4
 +
  #if defined(CONFIG_PPC_BOOK3S_64)
  #define MSR_64BIT     MSR_SF
  
  #define SPRN_UAMOR    0x9d    /* User Authority Mask Override Register */
  #define SPRN_AMOR     0x15d   /* Authority Mask Override Register */
  #define SPRN_ACOP     0x1F    /* Available Coprocessor Register */
 +#define SPRN_TFIAR    0x81    /* Transaction Failure Inst Addr   */
 +#define SPRN_TEXASR   0x82    /* Transaction EXception & Summary */
 +#define SPRN_TEXASRU  0x83    /* ''      ''      ''    Upper 32  */
 +#define SPRN_TFHAR    0x80    /* Transaction Failure Handler Addr */
  #define SPRN_CTRLF    0x088
  #define SPRN_CTRLT    0x098
  #define   CTRL_CT     0xc0000000      /* current thread */
  #define   CTRL_CT1    0x40000000      /* thread 1 */
  #define   CTRL_TE     0x00c00000      /* thread enable */
  #define   CTRL_RUNLATCH       0x1
 +#define SPRN_DAWR     0xB4
 +#define SPRN_DAWRX    0xBC
 +#define   DAWRX_USER  (1UL << 0)
 +#define   DAWRX_KERNEL        (1UL << 1)
 +#define   DAWRX_HYP   (1UL << 2)
  #define SPRN_DABR     0x3F5   /* Data Address Breakpoint Register */
 -#define   DABR_TRANSLATION    (1UL << 2)
 -#define   DABR_DATA_WRITE     (1UL << 1)
 -#define   DABR_DATA_READ      (1UL << 0)
  #define SPRN_DABR2    0x13D   /* e300 */
  #define SPRN_DABRX    0x3F7   /* Data Address Breakpoint Register Extension */
  #define   DABRX_USER  (1UL << 0)
  #define SPRN_HRMOR    0x139   /* Real mode offset register */
  #define SPRN_HSRR0    0x13A   /* Hypervisor Save/Restore 0 */
  #define SPRN_HSRR1    0x13B   /* Hypervisor Save/Restore 1 */
 +#define SPRN_FSCR     0x099   /* Facility Status & Control Register */
 +#define FSCR_TAR      (1<<8)  /* Enable Target Address Register */
 +#define SPRN_TAR      0x32f   /* Target Address Register */
  #define SPRN_LPCR     0x13E   /* LPAR Control Register */
  #define   LPCR_VPM0   (1ul << (63-0))
  #define   LPCR_VPM1   (1ul << (63-1))
  #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
  #define         LPCR_RMLS_SH  (63-37)
  #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 +#define   LPCR_AIL_0  0x00000000      /* MMU off exception offset 0x0 */
 +#define   LPCR_AIL_3  0x01800000      /* MMU on exception offset 0xc00...4xxx */
  #define   LPCR_PECE   0x00007000      /* powersave exit cause enable */
  #define     LPCR_PECE0        0x00004000      /* ext. exceptions can cause exit */
  #define     LPCR_PECE1        0x00002000      /* decrementer can cause exit */
  #define SPRN_DBAT6U   0x23C   /* Data BAT 6 Upper Register */
  #define SPRN_DBAT7L   0x23F   /* Data BAT 7 Lower Register */
  #define SPRN_DBAT7U   0x23E   /* Data BAT 7 Upper Register */
 +#define SPRN_PPR      0x380   /* SMT Thread status Register */
  
  #define SPRN_DEC      0x016           /* Decrement Register */
  #define SPRN_DER      0x095           /* Debug Enable Register */
  #ifndef SPRN_PIR
  #define SPRN_PIR      0x3FF   /* Processor Identification Register */
  #endif
 +#define SPRN_TIR      0x1BE   /* Thread Identification Register */
  #define SPRN_PTEHI    0x3D5   /* 981 7450 PTE HI word (S/W TLB load) */
  #define SPRN_PTELO    0x3D6   /* 982 7450 PTE LO word (S/W TLB load) */
  #define SPRN_PURR     0x135   /* Processor Utilization of Resources Reg */
   *        HV mode in which case it is HSPRG0
   *
   * 64-bit server:
 - *    - SPRG0 unused (reserved for HV on Power4)
 + *    - SPRG0 scratch for TM recheckpoint/reclaim (reserved for HV on Power4)
   *    - SPRG2 scratch for exception vectors
   *    - SPRG3 CPU and NUMA node for VDSO getcpu (user visible)
   *      - HSPRG0 stores PACA in HV mode
  #define SPRN_SPRG_RSCRATCH_DBG        SPRN_SPRG9
  #define SPRN_SPRG_WSCRATCH_DBG        SPRN_SPRG9
  #endif
- #define SPRN_SPRG_RVCPU               SPRN_SPRG1
- #define SPRN_SPRG_WVCPU               SPRN_SPRG1
  #endif
  
  #ifdef CONFIG_8xx
  #define PVR_970MP     0x0044
  #define PVR_970GX     0x0045
  #define PVR_POWER7p   0x004A
 +#define PVR_POWER8    0x004B
  #define PVR_BE                0x0070
  #define PVR_PA6T      0x0090
  
index 781190367292514e9d1fd39fdda0ab62fea54a3b,46f6afd2172aac2b006c21b9c9a870a624bad24b..b6c17ec9b1691c80de11785b9ea850930d778298
@@@ -77,7 -77,6 +77,7 @@@ int main(void
        DEFINE(NMI_MASK, NMI_MASK);
        DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr));
        DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit));
 +      DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr));
  #else
        DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
  #endif /* CONFIG_PPC64 */
  #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
        DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
  #endif
- #ifdef CONFIG_KVM_BOOKE_HV
+ #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
        DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
  #endif
  
 +#ifdef CONFIG_PPC_BOOK3S_64
 +      DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar));
 +#endif
 +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 +      DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
 +      DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
 +      DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
 +      DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
 +      DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
 +      DEFINE(THREAD_TRANSACT_VR0, offsetof(struct thread_struct,
 +                                       transact_vr[0]));
 +      DEFINE(THREAD_TRANSACT_VSCR, offsetof(struct thread_struct,
 +                                        transact_vscr));
 +      DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct,
 +                                          transact_vrsave));
 +      DEFINE(THREAD_TRANSACT_FPR0, offsetof(struct thread_struct,
 +                                        transact_fpr[0]));
 +      DEFINE(THREAD_TRANSACT_FPSCR, offsetof(struct thread_struct,
 +                                         transact_fpscr));
 +#ifdef CONFIG_VSX
 +      DEFINE(THREAD_TRANSACT_VSR0, offsetof(struct thread_struct,
 +                                        transact_fpr[0]));
 +#endif
 +      /* Local pt_regs on stack for Transactional Memory funcs. */
 +      DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
 +             sizeof(struct pt_regs) + 16);
 +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 +
        DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
        DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
        DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
        DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
        DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
 +      DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
        DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
        DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(IPI_PRIORITY, IPI_PRIORITY);
  #endif /* CONFIG_KVM_BOOK3S_64_HV */
  
 +#ifdef CONFIG_PPC_BOOK3S_64
 +      HSTATE_FIELD(HSTATE_CFAR, cfar);
 +#endif /* CONFIG_PPC_BOOK3S_64 */
 +
  #else /* CONFIG_PPC_BOOK3S */
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
        DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
index 6702442ca81899c2b92948339fd087c5bb15509e,73ed11c41bacfe691f60607c5519b179ae13f3cf..5e93438afb068c89b27062cec2bb772d4e77ce4c
@@@ -34,8 -34,6 +34,8 @@@
  #include <asm/kvm_book3s.h>
  #include <asm/mmu_context.h>
  #include <asm/switch_to.h>
 +#include <asm/firmware.h>
 +#include <asm/hvcall.h>
  #include <linux/gfp.h>
  #include <linux/sched.h>
  #include <linux/vmalloc.h>
@@@ -762,6 -760,11 +762,11 @@@ program_interrupt
                        run->exit_reason = KVM_EXIT_MMIO;
                        r = RESUME_HOST_NV;
                        break;
+               case EMULATE_DO_PAPR:
+                       run->exit_reason = KVM_EXIT_PAPR_HCALL;
+                       vcpu->arch.hcall_needed = 1;
+                       r = RESUME_HOST_NV;
+                       break;
                default:
                        BUG();
                }
@@@ -1286,21 -1289,12 +1291,21 @@@ void kvmppc_core_flush_memslot(struct k
  {
  }
  
 +static unsigned int kvm_global_user_count = 0;
 +static DEFINE_SPINLOCK(kvm_global_user_count_lock);
 +
  int kvmppc_core_init_vm(struct kvm *kvm)
  {
  #ifdef CONFIG_PPC64
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
  #endif
  
 +      if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
 +              spin_lock(&kvm_global_user_count_lock);
 +              if (++kvm_global_user_count == 1)
 +                      pSeries_disable_reloc_on_exc();
 +              spin_unlock(&kvm_global_user_count_lock);
 +      }
        return 0;
  }
  
@@@ -1309,14 -1303,6 +1314,14 @@@ void kvmppc_core_destroy_vm(struct kvm 
  #ifdef CONFIG_PPC64
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
  #endif
 +
 +      if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
 +              spin_lock(&kvm_global_user_count_lock);
 +              BUG_ON(kvm_global_user_count == 0);
 +              if (--kvm_global_user_count == 0)
 +                      pSeries_enable_reloc_on_exc();
 +              spin_unlock(&kvm_global_user_count_lock);
 +      }
  }
  
  static int kvmppc_book3s_init(void)
index 9d9cddc5b346ff9518009539c08804b1cf0f9682,71abcf4e2bdafa13a8226bea831a49545c9c2810..7a73b6f72a8ba4a9031d3426c93b32abc03e0951
@@@ -39,7 -39,6 +39,7 @@@
  #define OP_31_XOP_TRAP      4
  #define OP_31_XOP_LWZX      23
  #define OP_31_XOP_TRAP_64   68
 +#define OP_31_XOP_DCBF      86
  #define OP_31_XOP_LBZX      87
  #define OP_31_XOP_STWX      151
  #define OP_31_XOP_STBX      215
@@@ -150,8 -149,6 +150,6 @@@ static int kvmppc_emulate_mtspr(struct 
        case SPRN_TBWL: break;
        case SPRN_TBWU: break;
  
-       case SPRN_MSSSR0: break;
        case SPRN_DEC:
                vcpu->arch.dec = spr_val;
                kvmppc_emulate_dec(vcpu);
@@@ -202,9 -199,6 +200,6 @@@ static int kvmppc_emulate_mfspr(struct 
        case SPRN_PIR:
                spr_val = vcpu->vcpu_id;
                break;
-       case SPRN_MSSSR0:
-               spr_val = 0;
-               break;
  
        /* Note: mftb and TBRL/TBWL are user-accessible, so
         * the guest can always access the real TB anyways.
@@@ -375,7 -369,6 +370,7 @@@ int kvmppc_emulate_instruction(struct k
                        emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
                        break;
  
 +              case OP_31_XOP_DCBF:
                case OP_31_XOP_DCBI:
                        /* Do nothing. The guest is performing dcbi because
                         * hardware DMA is not snooped by the dcache, but
index 7def77302d630995a1018cd2ffe02a05ee289e10,aa6d0d74cec91ca06d409655a8d5bb2a60cb85e7..87c17bfb2968e8423fed70784417b36384754f91
@@@ -2,61 -2,44 +2,62 @@@
  #define _ASM_IRQ_H
  
  #include <linux/hardirq.h>
 +#include <linux/percpu.h>
 +#include <linux/cache.h>
  #include <linux/types.h>
  
 -enum interruption_class {
 +enum interruption_main_class {
        EXTERNAL_INTERRUPT,
        IO_INTERRUPT,
 -      EXTINT_CLK,
 -      EXTINT_EXC,
 -      EXTINT_EMS,
 -      EXTINT_TMR,
 -      EXTINT_TLA,
 -      EXTINT_PFL,
 -      EXTINT_DSD,
 -      EXTINT_VRT,
 -      EXTINT_SCP,
 -      EXTINT_IUC,
 -      EXTINT_CMS,
 -      EXTINT_CMC,
 -      EXTINT_CMR,
 -      IOINT_CIO,
 -      IOINT_QAI,
 -      IOINT_DAS,
 -      IOINT_C15,
 -      IOINT_C70,
 -      IOINT_TAP,
 -      IOINT_VMR,
 -      IOINT_LCS,
 -      IOINT_CLW,
 -      IOINT_CTC,
 -      IOINT_APB,
 -      IOINT_ADM,
 -      IOINT_CSC,
 -      IOINT_PCI,
 -      IOINT_MSI,
 -      IOINT_VIR,
 +      NR_IRQS
 +};
 +
 +enum interruption_class {
 +      IRQEXT_CLK,
 +      IRQEXT_EXC,
 +      IRQEXT_EMS,
 +      IRQEXT_TMR,
 +      IRQEXT_TLA,
 +      IRQEXT_PFL,
 +      IRQEXT_DSD,
 +      IRQEXT_VRT,
 +      IRQEXT_SCP,
 +      IRQEXT_IUC,
 +      IRQEXT_CMS,
 +      IRQEXT_CMC,
 +      IRQEXT_CMR,
 +      IRQIO_CIO,
 +      IRQIO_QAI,
 +      IRQIO_DAS,
 +      IRQIO_C15,
 +      IRQIO_C70,
 +      IRQIO_TAP,
 +      IRQIO_VMR,
 +      IRQIO_LCS,
 +      IRQIO_CLW,
 +      IRQIO_CTC,
 +      IRQIO_APB,
 +      IRQIO_ADM,
 +      IRQIO_CSC,
 +      IRQIO_PCI,
 +      IRQIO_MSI,
++      IRQIO_VIR,
        NMI_NMI,
 -      NR_IRQS,
 +      CPU_RST,
 +      NR_ARCH_IRQS
  };
  
 +struct irq_stat {
 +      unsigned int irqs[NR_ARCH_IRQS];
 +};
 +
 +DECLARE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
 +
 +static __always_inline void inc_irq_stat(enum interruption_class irq)
 +{
 +      __get_cpu_var(irq_stat).irqs[irq]++;
 +}
 +
  struct ext_code {
        unsigned short subcode;
        unsigned short code;
diff --combined arch/s390/kernel/irq.c
index 9df824ea16672aea9e8f6a93ec6c925f7b982a3a,a9806ea3ebd7244db2a45b370927946fdecbfe01..1630f439cd2a567d27d0ddb5828917808fec5f15
  #include <asm/irq.h>
  #include "entry.h"
  
 +DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
 +EXPORT_PER_CPU_SYMBOL_GPL(irq_stat);
 +
  struct irq_class {
        char *name;
        char *desc;
  };
  
 -static const struct irq_class intrclass_names[] = {
 +/*
 + * The list of "main" irq classes on s390. This is the list of interrupts
 + * that appear both in /proc/stat ("intr" line) and /proc/interrupts.
 + * Historically only external and I/O interrupts have been part of /proc/stat.
 + * We can't add the split external and I/O sub classes since the first field
 + * in the "intr" line in /proc/stat is supposed to be the sum of all other
 + * fields.
 + * Since the external and I/O interrupt fields are already sums we would end
 + * up with having a sum which accounts each interrupt twice.
 + */
 +static const struct irq_class irqclass_main_desc[NR_IRQS] = {
        [EXTERNAL_INTERRUPT] = {.name = "EXT"},
 -      [IO_INTERRUPT]       = {.name = "I/O"},
 -      [EXTINT_CLK] = {.name = "CLK", .desc = "[EXT] Clock Comparator"},
 -      [EXTINT_EXC] = {.name = "EXC", .desc = "[EXT] External Call"},
 -      [EXTINT_EMS] = {.name = "EMS", .desc = "[EXT] Emergency Signal"},
 -      [EXTINT_TMR] = {.name = "TMR", .desc = "[EXT] CPU Timer"},
 -      [EXTINT_TLA] = {.name = "TAL", .desc = "[EXT] Timing Alert"},
 -      [EXTINT_PFL] = {.name = "PFL", .desc = "[EXT] Pseudo Page Fault"},
 -      [EXTINT_DSD] = {.name = "DSD", .desc = "[EXT] DASD Diag"},
 -      [EXTINT_VRT] = {.name = "VRT", .desc = "[EXT] Virtio"},
 -      [EXTINT_SCP] = {.name = "SCP", .desc = "[EXT] Service Call"},
 -      [EXTINT_IUC] = {.name = "IUC", .desc = "[EXT] IUCV"},
 -      [EXTINT_CMS] = {.name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"},
 -      [EXTINT_CMC] = {.name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"},
 -      [EXTINT_CMR] = {.name = "CMR", .desc = "[EXT] CPU-Measurement: RI"},
 -      [IOINT_CIO]  = {.name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"},
 -      [IOINT_QAI]  = {.name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"},
 -      [IOINT_DAS]  = {.name = "DAS", .desc = "[I/O] DASD"},
 -      [IOINT_C15]  = {.name = "C15", .desc = "[I/O] 3215"},
 -      [IOINT_C70]  = {.name = "C70", .desc = "[I/O] 3270"},
 -      [IOINT_TAP]  = {.name = "TAP", .desc = "[I/O] Tape"},
 -      [IOINT_VMR]  = {.name = "VMR", .desc = "[I/O] Unit Record Devices"},
 -      [IOINT_LCS]  = {.name = "LCS", .desc = "[I/O] LCS"},
 -      [IOINT_CLW]  = {.name = "CLW", .desc = "[I/O] CLAW"},
 -      [IOINT_CTC]  = {.name = "CTC", .desc = "[I/O] CTC"},
 -      [IOINT_APB]  = {.name = "APB", .desc = "[I/O] AP Bus"},
 -      [IOINT_ADM]  = {.name = "ADM", .desc = "[I/O] EADM Subchannel"},
 -      [IOINT_CSC]  = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
 -      [IOINT_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
 -      [IOINT_MSI] =  {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
 -      [IOINT_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
 +      [IO_INTERRUPT]       = {.name = "I/O"}
 +};
 +
 +/*
 + * The list of split external and I/O interrupts that appear only in
 + * /proc/interrupts.
 + * In addition this list contains non external / I/O events like NMIs.
 + */
 +static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
 +      [IRQEXT_CLK] = {.name = "CLK", .desc = "[EXT] Clock Comparator"},
 +      [IRQEXT_EXC] = {.name = "EXC", .desc = "[EXT] External Call"},
 +      [IRQEXT_EMS] = {.name = "EMS", .desc = "[EXT] Emergency Signal"},
 +      [IRQEXT_TMR] = {.name = "TMR", .desc = "[EXT] CPU Timer"},
 +      [IRQEXT_TLA] = {.name = "TAL", .desc = "[EXT] Timing Alert"},
 +      [IRQEXT_PFL] = {.name = "PFL", .desc = "[EXT] Pseudo Page Fault"},
 +      [IRQEXT_DSD] = {.name = "DSD", .desc = "[EXT] DASD Diag"},
 +      [IRQEXT_VRT] = {.name = "VRT", .desc = "[EXT] Virtio"},
 +      [IRQEXT_SCP] = {.name = "SCP", .desc = "[EXT] Service Call"},
 +      [IRQEXT_IUC] = {.name = "IUC", .desc = "[EXT] IUCV"},
 +      [IRQEXT_CMS] = {.name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"},
 +      [IRQEXT_CMC] = {.name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"},
 +      [IRQEXT_CMR] = {.name = "CMR", .desc = "[EXT] CPU-Measurement: RI"},
 +      [IRQIO_CIO]  = {.name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"},
 +      [IRQIO_QAI]  = {.name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"},
 +      [IRQIO_DAS]  = {.name = "DAS", .desc = "[I/O] DASD"},
 +      [IRQIO_C15]  = {.name = "C15", .desc = "[I/O] 3215"},
 +      [IRQIO_C70]  = {.name = "C70", .desc = "[I/O] 3270"},
 +      [IRQIO_TAP]  = {.name = "TAP", .desc = "[I/O] Tape"},
 +      [IRQIO_VMR]  = {.name = "VMR", .desc = "[I/O] Unit Record Devices"},
 +      [IRQIO_LCS]  = {.name = "LCS", .desc = "[I/O] LCS"},
 +      [IRQIO_CLW]  = {.name = "CLW", .desc = "[I/O] CLAW"},
 +      [IRQIO_CTC]  = {.name = "CTC", .desc = "[I/O] CTC"},
 +      [IRQIO_APB]  = {.name = "APB", .desc = "[I/O] AP Bus"},
 +      [IRQIO_ADM]  = {.name = "ADM", .desc = "[I/O] EADM Subchannel"},
 +      [IRQIO_CSC]  = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
 +      [IRQIO_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
 +      [IRQIO_MSI]  = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
++      [IRQIO_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
        [NMI_NMI]    = {.name = "NMI", .desc = "[NMI] Machine Check"},
 +      [CPU_RST]    = {.name = "RST", .desc = "[CPU] CPU Restart"},
  };
  
  /*
   */
  int show_interrupts(struct seq_file *p, void *v)
  {
 -      int i = *(loff_t *) v, j;
 +      int irq = *(loff_t *) v;
 +      int cpu;
  
        get_online_cpus();
 -      if (i == 0) {
 +      if (irq == 0) {
                seq_puts(p, "           ");
 -              for_each_online_cpu(j)
 -                      seq_printf(p, "CPU%d       ",j);
 +              for_each_online_cpu(cpu)
 +                      seq_printf(p, "CPU%d       ", cpu);
                seq_putc(p, '\n');
        }
 -
 -      if (i < NR_IRQS) {
 -              seq_printf(p, "%s: ", intrclass_names[i].name);
 -#ifndef CONFIG_SMP
 -              seq_printf(p, "%10u ", kstat_irqs(i));
 -#else
 -              for_each_online_cpu(j)
 -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 -#endif
 -              if (intrclass_names[i].desc)
 -                      seq_printf(p, "  %s", intrclass_names[i].desc);
 -                seq_putc(p, '\n');
 -        }
 +      if (irq < NR_IRQS) {
 +              seq_printf(p, "%s: ", irqclass_main_desc[irq].name);
 +              for_each_online_cpu(cpu)
 +                      seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[irq]);
 +              seq_putc(p, '\n');
 +              goto skip_arch_irqs;
 +      }
 +      for (irq = 0; irq < NR_ARCH_IRQS; irq++) {
 +              seq_printf(p, "%s: ", irqclass_sub_desc[irq].name);
 +              for_each_online_cpu(cpu)
 +                      seq_printf(p, "%10u ", per_cpu(irq_stat, cpu).irqs[irq]);
 +              if (irqclass_sub_desc[irq].desc)
 +                      seq_printf(p, "  %s", irqclass_sub_desc[irq].desc);
 +              seq_putc(p, '\n');
 +      }
 +skip_arch_irqs:
        put_online_cpus();
 -        return 0;
 +      return 0;
  }
  
  /*
@@@ -248,7 -223,7 +249,7 @@@ void __irq_entry do_extint(struct pt_re
                /* Serve timer interrupts first. */
                clock_comparator_work();
        }
 -      kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++;
 +      kstat_incr_irqs_this_cpu(EXTERNAL_INTERRUPT, NULL);
        if (ext_code.code != 0x1004)
                __get_cpu_var(s390_idle).nohz_delay = 1;
  
index 87418b50f21cec676929159f84765152a0395e53,2f6ccb065c4aa47aca03c1f5276ec2b253b01f0b..37116a77cb4b8b16bc8dbb698e34ced1510b03f5
  #include "gaccess.h"
  #include "trace-s390.h"
  
+ #define IOINT_SCHID_MASK 0x0000ffff
+ #define IOINT_SSID_MASK 0x00030000
+ #define IOINT_CSSID_MASK 0x03fc0000
+ #define IOINT_AI_MASK 0x04000000
+ static int is_ioint(u64 type)
+ {
+       return ((type & 0xfffe0000u) != 0xfffe0000u);
+ }
  static int psw_extint_disabled(struct kvm_vcpu *vcpu)
  {
        return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
  }
  
+ static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
+ {
+       return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
+ }
+ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
+ {
+       return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
+ }
  static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
  {
        if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
        return 1;
  }
  
+ static u64 int_word_to_isc_bits(u32 int_word)
+ {
+       u8 isc = (int_word & 0x38000000) >> 27;
+       return (0x80 >> isc) << 24;
+ }
  static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                                      struct kvm_s390_interrupt_info *inti)
  {
        case KVM_S390_SIGP_SET_PREFIX:
        case KVM_S390_RESTART:
                return 1;
+       case KVM_S390_MCHK:
+               if (psw_mchk_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
+                       return 1;
+               return 0;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (psw_ioint_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[6] &
+                   int_word_to_isc_bits(inti->io.io_int_word))
+                       return 1;
+               return 0;
        default:
+               printk(KERN_WARNING "illegal interrupt type %llx\n",
+                      inti->type);
                BUG();
        }
        return 0;
@@@ -93,6 -135,7 +135,7 @@@ static void __reset_intercept_indicator
                CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
                &vcpu->arch.sie_block->cpuflags);
        vcpu->arch.sie_block->lctl = 0x0000;
+       vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
  }
  
  static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@@ -116,6 -159,18 +159,18 @@@ static void __set_intercept_indicator(s
        case KVM_S390_SIGP_STOP:
                __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
                break;
+       case KVM_S390_MCHK:
+               if (psw_mchk_disabled(vcpu))
+                       vcpu->arch.sie_block->ictl |= ICTL_LPSW;
+               else
+                       vcpu->arch.sie_block->lctl |= LCTL_CR14;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (psw_ioint_disabled(vcpu))
+                       __set_cpuflag(vcpu, CPUSTAT_IO_INT);
+               else
+                       vcpu->arch.sie_block->lctl |= LCTL_CR6;
+               break;
        default:
                BUG();
        }
@@@ -297,6 -352,73 +352,73 @@@ static void __do_deliver_interrupt(stru
                        exception = 1;
                break;
  
+       case KVM_S390_MCHK:
+               VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
+                          inti->mchk.mcic);
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->mchk.cr14,
+                                                inti->mchk.mcic);
+               rc = kvm_s390_vcpu_store_status(vcpu,
+                                               KVM_S390_STORE_STATUS_PREFIXED);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
+                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                    __LC_MCK_NEW_PSW, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+       {
+               __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
+                       inti->io.subchannel_nr;
+               __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
+                       inti->io.io_int_word;
+               VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
+               vcpu->stat.deliver_io_int++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                param0, param1);
+               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
+                                  inti->io.subchannel_id);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
+                                  inti->io.subchannel_nr);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
+                                  inti->io.io_int_parm);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
+                                  inti->io.io_int_word);
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
+                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                    __LC_IO_NEW_PSW, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               break;
+       }
        default:
                BUG();
        }
@@@ -362,7 -484,7 +484,7 @@@ static int kvm_cpu_has_interrupt(struc
        }
  
        if ((!rc) && (vcpu->arch.sie_block->ckc <
 -              get_clock() + vcpu->arch.sie_block->epoch)) {
 +              get_tod_clock() + vcpu->arch.sie_block->epoch)) {
                if ((!psw_extint_disabled(vcpu)) &&
                        (vcpu->arch.sie_block->gcr[0] & 0x800ul))
                        rc = 1;
@@@ -402,13 -524,13 +524,13 @@@ int kvm_s390_handle_wait(struct kvm_vcp
                goto no_timer;
        }
  
 -      now = get_clock() + vcpu->arch.sie_block->epoch;
 +      now = get_tod_clock() + vcpu->arch.sie_block->epoch;
        if (vcpu->arch.sie_block->ckc < now) {
                __unset_cpu_idle(vcpu);
                return 0;
        }
  
 -      sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9;
 +      sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
  
        hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
        VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
@@@ -492,7 -614,7 +614,7 @@@ void kvm_s390_deliver_pending_interrupt
        }
  
        if ((vcpu->arch.sie_block->ckc <
 -              get_clock() + vcpu->arch.sie_block->epoch))
 +              get_tod_clock() + vcpu->arch.sie_block->epoch))
                __try_deliver_ckc_interrupt(vcpu);
  
        if (atomic_read(&fi->active)) {
        }
  }
  
+ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+       int deliver;
+       __reset_intercept_indicators(vcpu);
+       if (atomic_read(&li->active)) {
+               do {
+                       deliver = 0;
+                       spin_lock_bh(&li->lock);
+                       list_for_each_entry_safe(inti, n, &li->list, list) {
+                               if ((inti->type == KVM_S390_MCHK) &&
+                                   __interrupt_is_deliverable(vcpu, inti)) {
+                                       list_del(&inti->list);
+                                       deliver = 1;
+                                       break;
+                               }
+                               __set_intercept_indicator(vcpu, inti);
+                       }
+                       if (list_empty(&li->list))
+                               atomic_set(&li->active, 0);
+                       spin_unlock_bh(&li->lock);
+                       if (deliver) {
+                               __do_deliver_interrupt(vcpu, inti);
+                               kfree(inti);
+                       }
+               } while (deliver);
+       }
+       if (atomic_read(&fi->active)) {
+               do {
+                       deliver = 0;
+                       spin_lock(&fi->lock);
+                       list_for_each_entry_safe(inti, n, &fi->list, list) {
+                               if ((inti->type == KVM_S390_MCHK) &&
+                                   __interrupt_is_deliverable(vcpu, inti)) {
+                                       list_del(&inti->list);
+                                       deliver = 1;
+                                       break;
+                               }
+                               __set_intercept_indicator(vcpu, inti);
+                       }
+                       if (list_empty(&fi->list))
+                               atomic_set(&fi->active, 0);
+                       spin_unlock(&fi->lock);
+                       if (deliver) {
+                               __do_deliver_interrupt(vcpu, inti);
+                               kfree(inti);
+                       }
+               } while (deliver);
+       }
+ }
  int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
  {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        return 0;
  }
  
+ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
+                                                   u64 cr6, u64 schid)
+ {
+       struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_interrupt_info *inti, *iter;
+       if ((!schid && !cr6) || (schid && cr6))
+               return NULL;
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       inti = NULL;
+       list_for_each_entry(iter, &fi->list, list) {
+               if (!is_ioint(iter->type))
+                       continue;
+               if (cr6 &&
+                   ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
+                       continue;
+               if (schid) {
+                       if (((schid & 0x00000000ffff0000) >> 16) !=
+                           iter->io.subchannel_id)
+                               continue;
+                       if ((schid & 0x000000000000ffff) !=
+                           iter->io.subchannel_nr)
+                               continue;
+               }
+               inti = iter;
+               break;
+       }
+       if (inti)
+               list_del_init(&inti->list);
+       if (list_empty(&fi->list))
+               atomic_set(&fi->active, 0);
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+       return inti;
+ }
  int kvm_s390_inject_vm(struct kvm *kvm,
                       struct kvm_s390_interrupt *s390int)
  {
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_interrupt_info *inti, *iter;
        int sigcpu;
  
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
        case KVM_S390_SIGP_STOP:
        case KVM_S390_INT_EXTERNAL_CALL:
        case KVM_S390_INT_EMERGENCY:
+               kfree(inti);
+               return -EINVAL;
+       case KVM_S390_MCHK:
+               VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
+                        s390int->parm64);
+               inti->type = s390int->type;
+               inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
+               inti->mchk.mcic = s390int->parm64;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (s390int->type & IOINT_AI_MASK)
+                       VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
+               else
+                       VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
+                                s390int->type & IOINT_CSSID_MASK,
+                                s390int->type & IOINT_SSID_MASK,
+                                s390int->type & IOINT_SCHID_MASK);
+               inti->type = s390int->type;
+               inti->io.subchannel_id = s390int->parm >> 16;
+               inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
+               inti->io.io_int_parm = s390int->parm64 >> 32;
+               inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
+               break;
        default:
                kfree(inti);
                return -EINVAL;
        mutex_lock(&kvm->lock);
        fi = &kvm->arch.float_int;
        spin_lock(&fi->lock);
-       list_add_tail(&inti->list, &fi->list);
+       if (!is_ioint(inti->type))
+               list_add_tail(&inti->list, &fi->list);
+       else {
+               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+               /* Keep I/O interrupts sorted in isc order. */
+               list_for_each_entry(iter, &fi->list, list) {
+                       if (!is_ioint(iter->type))
+                               continue;
+                       if (int_word_to_isc_bits(iter->io.io_int_word)
+                           <= isc_bits)
+                               continue;
+                       break;
+               }
+               list_add_tail(&inti->list, &iter->list);
+       }
        atomic_set(&fi->active, 1);
        sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
        if (sigcpu == KVM_MAX_VCPUS) {
@@@ -651,8 -904,15 +904,15 @@@ int kvm_s390_inject_vcpu(struct kvm_vcp
                inti->type = s390int->type;
                inti->emerg.code = s390int->parm;
                break;
+       case KVM_S390_MCHK:
+               VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
+                          s390int->parm64);
+               inti->type = s390int->type;
+               inti->mchk.mcic = s390int->parm64;
+               break;
        case KVM_S390_INT_VIRTIO:
        case KVM_S390_INT_SERVICE:
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
        default:
                kfree(inti);
                return -EINVAL;
diff --combined arch/s390/kvm/kvm-s390.c
index 2923781590a610a49a7efa33f4f5a8758011774e,4377d18866319e96c3d2ecf5e313701283fe96cb..4cf35a0a79e7734b6cd99803f3f54bd2df5c6396
@@@ -140,6 -140,8 +140,8 @@@ int kvm_dev_ioctl_check_extension(long 
  #endif
        case KVM_CAP_SYNC_REGS:
        case KVM_CAP_ONE_REG:
+       case KVM_CAP_ENABLE_CAP:
+       case KVM_CAP_S390_CSS_SUPPORT:
                r = 1;
                break;
        case KVM_CAP_NR_VCPUS:
                r = KVM_MAX_VCPUS;
                break;
        case KVM_CAP_S390_COW:
 -              r = sclp_get_fac85() & 0x2;
 +              r = MACHINE_HAS_ESOP;
                break;
        default:
                r = 0;
@@@ -234,6 -236,9 +236,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
                if (!kvm->arch.gmap)
                        goto out_nogmap;
        }
+       kvm->arch.css_support = 0;
        return 0;
  out_nogmap:
        debug_unregister(kvm->arch.dbf);
@@@ -613,9 -618,7 +618,9 @@@ static int __vcpu_run(struct kvm_vcpu *
                kvm_s390_deliver_pending_interrupts(vcpu);
  
        vcpu->arch.sie_block->icptcode = 0;
 +      preempt_disable();
        kvm_guest_enter();
 +      preempt_enable();
        VCPU_EVENT(vcpu, 6, "entering sie flags %x",
                   atomic_read(&vcpu->arch.sie_block->cpuflags));
        trace_kvm_s390_sie_enter(vcpu,
@@@ -659,6 -662,7 +664,7 @@@ rerun_vcpu
        case KVM_EXIT_INTR:
        case KVM_EXIT_S390_RESET:
        case KVM_EXIT_S390_UCONTROL:
+       case KVM_EXIT_S390_TSCH:
                break;
        default:
                BUG();
@@@ -766,6 -770,14 +772,14 @@@ int kvm_s390_vcpu_store_status(struct k
        } else
                prefix = 0;
  
+       /*
+        * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
+        * copying in vcpu load/put. Let's update our copies before we save
+        * them into the save area.
+        */
+       save_fp_regs(&vcpu->arch.guest_fpregs);
+       save_access_regs(vcpu->run->s.regs.acrs);
        if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
                        vcpu->arch.guest_fpregs.fprs, 128, prefix))
                return -EFAULT;
        return 0;
  }
  
+ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+                                    struct kvm_enable_cap *cap)
+ {
+       int r;
+       if (cap->flags)
+               return -EINVAL;
+       switch (cap->cap) {
+       case KVM_CAP_S390_CSS_SUPPORT:
+               if (!vcpu->kvm->arch.css_support) {
+                       vcpu->kvm->arch.css_support = 1;
+                       trace_kvm_s390_enable_css(vcpu->kvm);
+               }
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+ }
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
                        r = 0;
                break;
        }
+       case KVM_ENABLE_CAP:
+       {
+               struct kvm_enable_cap cap;
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       break;
+               r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
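kvm_vcpu_ioctl_enable_cap() above rejects non-zero flags and unknown
capabilities with -EINVAL, and enabling KVM_CAP_S390_CSS_SUPPORT on any one
vcpu flips the VM-wide css_support flag. A userspace sketch (vcpu_fd is an
assumption):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_css(int vcpu_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));   /* cap.flags must stay 0 */
            cap.cap = KVM_CAP_S390_CSS_SUPPORT;
            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }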
@@@ -930,7 -974,7 +976,7 @@@ int kvm_arch_prepare_memory_region(stru
                                   struct kvm_memory_slot *memslot,
                                   struct kvm_memory_slot old,
                                   struct kvm_userspace_memory_region *mem,
-                                  int user_alloc)
+                                  bool user_alloc)
  {
        /* A few sanity checks. We can have exactly one memory slot which has
           to start at guest virtual zero and which has to be located at a
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
                                struct kvm_memory_slot old,
-                               int user_alloc)
+                               bool user_alloc)
  {
        int rc;
  
index 65231e173bafceb5895f49da00b95f39c3088beb,f49c16d47581dcab236ccd096c6d0ecd322b2a17..695399f2d5eb315a62823a9fb8b1673a90447b04
@@@ -1,8 -1,103 +1,8 @@@
  #ifndef _ASM_X86_KVM_PARA_H
  #define _ASM_X86_KVM_PARA_H
  
 -#include <linux/types.h>
 -#include <asm/hyperv.h>
 -
 -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
 - * should be used to determine that a VM is running under KVM.
 - */
 -#define KVM_CPUID_SIGNATURE   0x40000000
 -
 -/* This CPUID returns a feature bitmap in eax.  Before enabling a particular
 - * paravirtualization, the appropriate feature bit should be checked.
 - */
 -#define KVM_CPUID_FEATURES    0x40000001
 -#define KVM_FEATURE_CLOCKSOURCE               0
 -#define KVM_FEATURE_NOP_IO_DELAY      1
 -#define KVM_FEATURE_MMU_OP            2
 -/* This indicates that the new set of kvmclock msrs
 - * are available. The use of 0x11 and 0x12 is deprecated
 - */
 -#define KVM_FEATURE_CLOCKSOURCE2        3
 -#define KVM_FEATURE_ASYNC_PF          4
 -#define KVM_FEATURE_STEAL_TIME                5
 -#define KVM_FEATURE_PV_EOI            6
 -
 -/* The last 8 bits are used to indicate how to interpret the flags field
 - * in pvclock structure. If no bits are set, all flags are ignored.
 - */
 -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT    24
 -
 -#define MSR_KVM_WALL_CLOCK  0x11
 -#define MSR_KVM_SYSTEM_TIME 0x12
 -
 -#define KVM_MSR_ENABLED 1
 -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 -#define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 -#define MSR_KVM_STEAL_TIME  0x4b564d03
 -#define MSR_KVM_PV_EOI_EN      0x4b564d04
 -
 -struct kvm_steal_time {
 -      __u64 steal;
 -      __u32 version;
 -      __u32 flags;
 -      __u32 pad[12];
 -};
 -
 -#define KVM_STEAL_ALIGNMENT_BITS 5
 -#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
 -#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
 -
 -#define KVM_MAX_MMU_OP_BATCH           32
 -
 -#define KVM_ASYNC_PF_ENABLED                  (1 << 0)
 -#define KVM_ASYNC_PF_SEND_ALWAYS              (1 << 1)
 -
 -/* Operations for KVM_HC_MMU_OP */
 -#define KVM_MMU_OP_WRITE_PTE            1
 -#define KVM_MMU_OP_FLUSH_TLB          2
 -#define KVM_MMU_OP_RELEASE_PT         3
 -
 -/* Payload for KVM_HC_MMU_OP */
 -struct kvm_mmu_op_header {
 -      __u32 op;
 -      __u32 pad;
 -};
 -
 -struct kvm_mmu_op_write_pte {
 -      struct kvm_mmu_op_header header;
 -      __u64 pte_phys;
 -      __u64 pte_val;
 -};
 -
 -struct kvm_mmu_op_flush_tlb {
 -      struct kvm_mmu_op_header header;
 -};
 -
 -struct kvm_mmu_op_release_pt {
 -      struct kvm_mmu_op_header header;
 -      __u64 pt_phys;
 -};
 -
 -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
 -#define KVM_PV_REASON_PAGE_READY 2
 -
 -struct kvm_vcpu_pv_apf_data {
 -      __u32 reason;
 -      __u8 pad[60];
 -      __u32 enabled;
 -};
 -
 -#define KVM_PV_EOI_BIT 0
 -#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
 -#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
 -#define KVM_PV_EOI_DISABLED 0x0
 -
 -#ifdef __KERNEL__
  #include <asm/processor.h>
 +#include <uapi/asm/kvm_para.h>
  
  extern void kvmclock_init(void);
  extern int kvm_register_clock(char *txt);
@@@ -27,7 -122,7 +27,7 @@@ static inline bool kvm_check_and_clear_
   *
   * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
   * The hypercall number should be placed in rax and the return value will be
-  * placed in rax.  No other registers will be clobbered unless explicited
+  * placed in rax.  No other registers will be clobbered unless explicitly
   * noted by the particular hypercall.
   */
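Under the calling convention described above (hypercall number in rax,
return value in rax), the zero-argument form reduces to a single inline asm
statement; a sketch, assuming the KVM_HYPERCALL instruction macro from this
header:

    static inline long kvm_hypercall0(unsigned int nr)
    {
            long ret;
            asm volatile(KVM_HYPERCALL
                         : "=a"(ret)
                         : "a"(nr)
                         : "memory");
            return ret;
    }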
  
@@@ -85,13 -180,13 +85,13 @@@ static inline long kvm_hypercall4(unsig
        return ret;
  }
  
 -static inline int kvm_para_available(void)
 +static inline bool kvm_para_available(void)
  {
        unsigned int eax, ebx, ecx, edx;
        char signature[13];
  
        if (boot_cpu_data.cpuid_level < 0)
 -              return 0;       /* So we don't blow up on old processors */
 +              return false;   /* So we don't blow up on old processors */
  
        if (cpu_has_hypervisor) {
                cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
                signature[12] = 0;
  
                if (strcmp(signature, "KVMKVMKVM") == 0)
 -                      return 1;
 +                      return true;
        }
  
 -      return 0;
 +      return false;
  }
  
  static inline unsigned int kvm_arch_para_features(void)
@@@ -133,4 -228,6 +133,4 @@@ static inline void kvm_disable_steal_ti
  }
  #endif
  
 -#endif /* __KERNEL__ */
 -
  #endif /* _ASM_X86_KVM_PARA_H */
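The same 'KVMKVMKVM' signature probe works from userspace. A hedged sketch
using GCC's <cpuid.h> helpers; the hypervisor-present bit (CPUID.1:ECX bit
31) mirrors the cpu_has_hypervisor check above:

    #include <cpuid.h>
    #include <string.h>

    static int running_on_kvm(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char sig[13];

            __cpuid(1, eax, ebx, ecx, edx);
            if (!(ecx & (1u << 31)))        /* no hypervisor present */
                    return 0;
            __cpuid(0x40000000, eax, ebx, ecx, edx);
            memcpy(sig + 0, &ebx, 4);
            memcpy(sig + 4, &ecx, 4);
            memcpy(sig + 8, &edx, 4);
            sig[12] = '\0';
            return strcmp(sig, "KVMKVMKVM") == 0;
    }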
index 235b49fa554bd614c6cb6b61c9c3618f1cd91a99,5c9dbadd364a3064072b491c1409216b852f3819..b6fbf860e398ed940bfb1113acd769a0afc827da
@@@ -1,3 -1,6 +1,3 @@@
 -#ifndef VMX_H
 -#define VMX_H
 -
  /*
   * vmx.h: VMX Architecture related definitions
   * Copyright (c) 2004, Intel Corporation.
   *    Yaniv Kamay <yaniv@qumranet.com>
   *
   */
 +#ifndef VMX_H
 +#define VMX_H
  
 -#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
 -
 -#define EXIT_REASON_EXCEPTION_NMI       0
 -#define EXIT_REASON_EXTERNAL_INTERRUPT  1
 -#define EXIT_REASON_TRIPLE_FAULT        2
 -
 -#define EXIT_REASON_PENDING_INTERRUPT   7
 -#define EXIT_REASON_NMI_WINDOW          8
 -#define EXIT_REASON_TASK_SWITCH         9
 -#define EXIT_REASON_CPUID               10
 -#define EXIT_REASON_HLT                 12
 -#define EXIT_REASON_INVD                13
 -#define EXIT_REASON_INVLPG              14
 -#define EXIT_REASON_RDPMC               15
 -#define EXIT_REASON_RDTSC               16
 -#define EXIT_REASON_VMCALL              18
 -#define EXIT_REASON_VMCLEAR             19
 -#define EXIT_REASON_VMLAUNCH            20
 -#define EXIT_REASON_VMPTRLD             21
 -#define EXIT_REASON_VMPTRST             22
 -#define EXIT_REASON_VMREAD              23
 -#define EXIT_REASON_VMRESUME            24
 -#define EXIT_REASON_VMWRITE             25
 -#define EXIT_REASON_VMOFF               26
 -#define EXIT_REASON_VMON                27
 -#define EXIT_REASON_CR_ACCESS           28
 -#define EXIT_REASON_DR_ACCESS           29
 -#define EXIT_REASON_IO_INSTRUCTION      30
 -#define EXIT_REASON_MSR_READ            31
 -#define EXIT_REASON_MSR_WRITE           32
 -#define EXIT_REASON_INVALID_STATE       33
 -#define EXIT_REASON_MWAIT_INSTRUCTION   36
 -#define EXIT_REASON_MONITOR_INSTRUCTION 39
 -#define EXIT_REASON_PAUSE_INSTRUCTION   40
 -#define EXIT_REASON_MCE_DURING_VMENTRY  41
 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 -#define EXIT_REASON_APIC_ACCESS         44
 -#define EXIT_REASON_EOI_INDUCED         45
 -#define EXIT_REASON_EPT_VIOLATION       48
 -#define EXIT_REASON_EPT_MISCONFIG       49
 -#define EXIT_REASON_WBINVD              54
 -#define EXIT_REASON_XSETBV              55
 -#define EXIT_REASON_APIC_WRITE          56
 -#define EXIT_REASON_INVPCID             58
 -
 -#define VMX_EXIT_REASONS \
 -      { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
 -      { EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \
 -      { EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \
 -      { EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \
 -      { EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \
 -      { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
 -      { EXIT_REASON_CPUID,                 "CPUID" }, \
 -      { EXIT_REASON_HLT,                   "HLT" }, \
 -      { EXIT_REASON_INVLPG,                "INVLPG" }, \
 -      { EXIT_REASON_RDPMC,                 "RDPMC" }, \
 -      { EXIT_REASON_RDTSC,                 "RDTSC" }, \
 -      { EXIT_REASON_VMCALL,                "VMCALL" }, \
 -      { EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \
 -      { EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \
 -      { EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \
 -      { EXIT_REASON_VMPTRST,               "VMPTRST" }, \
 -      { EXIT_REASON_VMREAD,                "VMREAD" }, \
 -      { EXIT_REASON_VMRESUME,              "VMRESUME" }, \
 -      { EXIT_REASON_VMWRITE,               "VMWRITE" }, \
 -      { EXIT_REASON_VMOFF,                 "VMOFF" }, \
 -      { EXIT_REASON_VMON,                  "VMON" }, \
 -      { EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \
 -      { EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \
 -      { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
 -      { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
 -      { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
 -      { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
 -      { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
 -      { EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \
 -      { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
 -      { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
 -      { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 -      { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 -      { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
 -      { EXIT_REASON_WBINVD,                "WBINVD" }, \
 -      { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 -      { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 -      { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 -      { EXIT_REASON_INVD,                  "INVD" }, \
 -      { EXIT_REASON_INVPCID,               "INVPCID" }
 -
 -#ifdef __KERNEL__
  
  #include <linux/types.h>
 +#include <uapi/asm/vmx.h>
  
  /*
   * Definitions of Primary Processor-Based VM-Execution Controls.
  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
  #define SECONDARY_EXEC_RDTSCP                 0x00000008
+ #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
  #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
  #define SECONDARY_EXEC_WBINVD_EXITING         0x00000040
  #define SECONDARY_EXEC_UNRESTRICTED_GUEST     0x00000080
+ #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
+ #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING     0x00000400
  #define SECONDARY_EXEC_ENABLE_INVPCID         0x00001000
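Whether the new APICv controls can actually be set is governed by the
allowed-1 bits in the high word of the corresponding VMX capability MSR; a
sketch of the usual probe, assuming MSR_IA32_VMX_PROCBASED_CTLS2 as the
capability MSR for the secondary controls:

    /* A set bit in the high 32 bits means the control may be enabled. */
    static inline bool cpu_has_apic_register_virt(void)
    {
            u32 low, high;

            rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, low, high);
            return high & SECONDARY_EXEC_APIC_REGISTER_VIRT;
    }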
  
@@@ -97,6 -188,7 +100,7 @@@ enum vmcs_field 
        GUEST_GS_SELECTOR               = 0x0000080a,
        GUEST_LDTR_SELECTOR             = 0x0000080c,
        GUEST_TR_SELECTOR               = 0x0000080e,
+       GUEST_INTR_STATUS               = 0x00000810,
        HOST_ES_SELECTOR                = 0x00000c00,
        HOST_CS_SELECTOR                = 0x00000c02,
        HOST_SS_SELECTOR                = 0x00000c04,
        APIC_ACCESS_ADDR_HIGH           = 0x00002015,
        EPT_POINTER                     = 0x0000201a,
        EPT_POINTER_HIGH                = 0x0000201b,
+       EOI_EXIT_BITMAP0                = 0x0000201c,
+       EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+       EOI_EXIT_BITMAP1                = 0x0000201e,
+       EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+       EOI_EXIT_BITMAP2                = 0x00002020,
+       EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+       EOI_EXIT_BITMAP3                = 0x00002022,
+       EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
        GUEST_PHYSICAL_ADDRESS          = 0x00002400,
        GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
        VMCS_LINK_POINTER               = 0x00002800,
  
  #define AR_RESERVD_MASK 0xfffe0f00
  
- #define TSS_PRIVATE_MEMSLOT                   (KVM_MEMORY_SLOTS + 0)
- #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT      (KVM_MEMORY_SLOTS + 1)
- #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT    (KVM_MEMORY_SLOTS + 2)
+ #define TSS_PRIVATE_MEMSLOT                   (KVM_USER_MEM_SLOTS + 0)
+ #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT      (KVM_USER_MEM_SLOTS + 1)
+ #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT    (KVM_USER_MEM_SLOTS + 2)
  
  #define VMX_NR_VPIDS                          (1 << 16)
  #define VMX_VPID_EXTENT_SINGLE_CONTEXT                1
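The four EOI_EXIT_BITMAPn fields form a 256-bit bitmap indexed by interrupt
vector. Loading it is four 64-bit VMCS writes; a sketch of what a
load_eoi_exitmap hook could look like (the generic hook is wired up in the
x86.c hunks below):

    static void load_eoi_exitmap(u64 *eoi_exit_bitmap)
    {
            vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
            vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
            vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
            vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
    }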
@@@ -445,3 -545,5 +457,3 @@@ enum vm_instruction_error_number 
  };
  
  #endif
 -
 -#endif
index 979d03bce135df3fe9bd9dc516cfaf449c22ea5e,0000000000000000000000000000000000000000..2871fccfee68619896f03d50ab4b93f75eaad8e3
mode 100644,000000..100644
--- /dev/null
@@@ -1,109 -1,0 +1,116 @@@
-       { EXIT_REASON_WBINVD,                "WBINVD" }
 +/*
 + * vmx.h: VMX Architecture related definitions
 + * Copyright (c) 2004, Intel Corporation.
 + *
 + * This program is free software; you can redistribute it and/or modify it
 + * under the terms and conditions of the GNU General Public License,
 + * version 2, as published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope it will be useful, but WITHOUT
 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 + * more details.
 + *
 + * You should have received a copy of the GNU General Public License along with
 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 + * Place - Suite 330, Boston, MA 02111-1307 USA.
 + *
 + * A few random additions are:
 + * Copyright (C) 2006 Qumranet
 + *    Avi Kivity <avi@qumranet.com>
 + *    Yaniv Kamay <yaniv@qumranet.com>
 + *
 + */
 +#ifndef _UAPIVMX_H
 +#define _UAPIVMX_H
 +
 +
 +#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
 +
 +#define EXIT_REASON_EXCEPTION_NMI       0
 +#define EXIT_REASON_EXTERNAL_INTERRUPT  1
 +#define EXIT_REASON_TRIPLE_FAULT        2
 +
 +#define EXIT_REASON_PENDING_INTERRUPT   7
 +#define EXIT_REASON_NMI_WINDOW          8
 +#define EXIT_REASON_TASK_SWITCH         9
 +#define EXIT_REASON_CPUID               10
 +#define EXIT_REASON_HLT                 12
 +#define EXIT_REASON_INVD                13
 +#define EXIT_REASON_INVLPG              14
 +#define EXIT_REASON_RDPMC               15
 +#define EXIT_REASON_RDTSC               16
 +#define EXIT_REASON_VMCALL              18
 +#define EXIT_REASON_VMCLEAR             19
 +#define EXIT_REASON_VMLAUNCH            20
 +#define EXIT_REASON_VMPTRLD             21
 +#define EXIT_REASON_VMPTRST             22
 +#define EXIT_REASON_VMREAD              23
 +#define EXIT_REASON_VMRESUME            24
 +#define EXIT_REASON_VMWRITE             25
 +#define EXIT_REASON_VMOFF               26
 +#define EXIT_REASON_VMON                27
 +#define EXIT_REASON_CR_ACCESS           28
 +#define EXIT_REASON_DR_ACCESS           29
 +#define EXIT_REASON_IO_INSTRUCTION      30
 +#define EXIT_REASON_MSR_READ            31
 +#define EXIT_REASON_MSR_WRITE           32
 +#define EXIT_REASON_INVALID_STATE       33
 +#define EXIT_REASON_MWAIT_INSTRUCTION   36
 +#define EXIT_REASON_MONITOR_INSTRUCTION 39
 +#define EXIT_REASON_PAUSE_INSTRUCTION   40
 +#define EXIT_REASON_MCE_DURING_VMENTRY  41
 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 +#define EXIT_REASON_APIC_ACCESS         44
++#define EXIT_REASON_EOI_INDUCED         45
 +#define EXIT_REASON_EPT_VIOLATION       48
 +#define EXIT_REASON_EPT_MISCONFIG       49
 +#define EXIT_REASON_WBINVD              54
 +#define EXIT_REASON_XSETBV              55
++#define EXIT_REASON_APIC_WRITE          56
 +#define EXIT_REASON_INVPCID             58
 +
 +#define VMX_EXIT_REASONS \
 +      { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
 +      { EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \
 +      { EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \
 +      { EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \
 +      { EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \
 +      { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
 +      { EXIT_REASON_CPUID,                 "CPUID" }, \
 +      { EXIT_REASON_HLT,                   "HLT" }, \
 +      { EXIT_REASON_INVLPG,                "INVLPG" }, \
 +      { EXIT_REASON_RDPMC,                 "RDPMC" }, \
 +      { EXIT_REASON_RDTSC,                 "RDTSC" }, \
 +      { EXIT_REASON_VMCALL,                "VMCALL" }, \
 +      { EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \
 +      { EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \
 +      { EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \
 +      { EXIT_REASON_VMPTRST,               "VMPTRST" }, \
 +      { EXIT_REASON_VMREAD,                "VMREAD" }, \
 +      { EXIT_REASON_VMRESUME,              "VMRESUME" }, \
 +      { EXIT_REASON_VMWRITE,               "VMWRITE" }, \
 +      { EXIT_REASON_VMOFF,                 "VMOFF" }, \
 +      { EXIT_REASON_VMON,                  "VMON" }, \
 +      { EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \
 +      { EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \
 +      { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
 +      { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
 +      { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
 +      { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
 +      { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
 +      { EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \
 +      { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
 +      { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
 +      { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 +      { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 +      { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
++      { EXIT_REASON_WBINVD,                "WBINVD" }, \
++      { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
++      { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
++      { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
++      { EXIT_REASON_INVD,                  "INVD" }, \
++      { EXIT_REASON_INVPCID,               "INVPCID" }
 +
 +
 +#endif /* _UAPIVMX_H */
index 9f966dc0b9e4aaf9fc882687dd11d8f64682e299,5bedbdddf1f2e3383f8a6fc82f81cdb2ecdcd1ad..0732f0089a3df2d0bcbde6b397fc8c3e1e76844c
@@@ -162,8 -162,8 +162,8 @@@ int kvm_register_clock(char *txt
        int low, high, ret;
        struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
  
 -      low = (int)__pa(src) | 1;
 -      high = ((u64)__pa(src) >> 32);
 +      low = (int)slow_virt_to_phys(src) | 1;
 +      high = ((u64)slow_virt_to_phys(src) >> 32);
        ret = native_write_msr_safe(msr_kvm_system_time, low, high);
        printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
               cpu, high, low, txt);
@@@ -218,6 -218,9 +218,9 @@@ static void kvm_shutdown(void
  void __init kvmclock_init(void)
  {
        unsigned long mem;
+       int size;
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
  
        if (!kvm_para_available())
                return;
        printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
                msr_kvm_system_time, msr_kvm_wall_clock);
  
-       mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
-                            PAGE_SIZE);
+       mem = memblock_alloc(size, PAGE_SIZE);
        if (!mem)
                return;
        hv_clock = __va(mem);
  
        if (kvm_register_clock("boot clock")) {
                hv_clock = NULL;
-               memblock_free(mem,
-                       sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+               memblock_free(mem, size);
                return;
        }
        pv_time_ops.sched_clock = kvm_clock_read;
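Computing size once with PAGE_ALIGN() keeps the allocation, the error-path
free, and the vsyscall mapping in the next hunk in agreement. Worked example,
assuming a 64-byte pvclock_vsyscall_time_info and 4 KiB pages: NR_CPUS = 64
gives 64 * 64 = 4096 bytes, exactly one page; NR_CPUS = 256 gives 16384
bytes, four pages. Conceptually:

    /* Round up to the next page boundary (sketch of the real macro). */
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(unsigned long)(PAGE_SIZE - 1))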
@@@ -275,7 -276,7 +276,7 @@@ int __init kvm_setup_vsyscall_timeinfo(
        struct pvclock_vcpu_time_info *vcpu_time;
        unsigned int size;
  
-       size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
  
        preempt_disable();
        cpu = smp_processor_id();
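kvm_register_clock() above packs the pvclock area's physical address into
the MSR as two 32-bit halves, with bit 0 of the low half used as the enable
bit (so the address must leave that bit free). The packing in isolation, as
a sketch:

    static inline void split_pa(u64 pa, u32 *low, u32 *high)
    {
            *low  = (u32)pa | 1;            /* enable bit */
            *high = (u32)(pa >> 32);
    }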
diff --combined arch/x86/kvm/x86.c
index 37040079cd6bec44276a4161b84e120fe923293f,3c5bb6fe52804c59c31c7304757fa2aa92dc0d1b..f71500af1f813245bb12092665ac7dea3ba5f24f
@@@ -120,7 -120,7 +120,7 @@@ struct kvm_shared_msrs 
  };
  
  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
 +static struct kvm_shared_msrs __percpu *shared_msrs;
  
  struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
@@@ -191,10 -191,10 +191,10 @@@ static void kvm_on_user_return(struct u
  
  static void shared_msr_update(unsigned slot, u32 msr)
  {
 -      struct kvm_shared_msrs *smsr;
        u64 value;
 +      unsigned int cpu = smp_processor_id();
 +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  
 -      smsr = &__get_cpu_var(shared_msrs);
        /* only read, and nobody should modify it at this time,
         * so don't need lock */
        if (slot >= shared_msrs_global.nr) {
@@@ -226,8 -226,7 +226,8 @@@ static void kvm_shared_msr_cpu_online(v
  
  void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
  {
 -      struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 +      unsigned int cpu = smp_processor_id();
 +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  
        if (((value ^ smsr->values[slot].curr) & mask) == 0)
                return;
@@@ -243,8 -242,7 +243,8 @@@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr)
  
  static void drop_user_return_notifiers(void *ignore)
  {
 -      struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 +      unsigned int cpu = smp_processor_id();
 +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  
        if (smsr->registered)
                kvm_on_user_return(&smsr->urn);
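The shared_msrs conversion replaces a static DEFINE_PER_CPU variable with a
dynamically allocated per-cpu area (allocated in kvm_arch_init() further
down), so every access becomes per_cpu_ptr() with an explicit CPU id. The
pattern in miniature, with a hypothetical payload struct:

    #include <linux/percpu.h>
    #include <linux/smp.h>

    struct counter_state { u64 hits; };             /* hypothetical */
    static struct counter_state __percpu *counters;

    static int counters_init(void)
    {
            counters = alloc_percpu(struct counter_state);
            return counters ? 0 : -ENOMEM;
    }

    static void touch_this_cpu(void)
    {
            /* Caller must keep preemption disabled, as the users above do. */
            struct counter_state *st = per_cpu_ptr(counters, smp_processor_id());
            st->hits++;
    }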
@@@ -872,8 -870,6 +872,6 @@@ static int set_efer(struct kvm_vcpu *vc
  
        kvm_x86_ops->set_efer(vcpu, efer);
  
-       vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
        /* Update reserved bits */
        if ((efer ^ old_efer) & EFER_NX)
                kvm_mmu_reset_context(vcpu);
@@@ -1881,14 -1877,6 +1879,14 @@@ int kvm_set_msr_common(struct kvm_vcpu 
        u64 data = msr_info->data;
  
        switch (msr) {
 +      case MSR_AMD64_NB_CFG:
 +      case MSR_IA32_UCODE_REV:
 +      case MSR_IA32_UCODE_WRITE:
 +      case MSR_VM_HSAVE_PA:
 +      case MSR_AMD64_PATCH_LOADER:
 +      case MSR_AMD64_BU_CFG2:
 +              break;
 +
        case MSR_EFER:
                return set_efer(vcpu, data);
        case MSR_K7_HWCR:
                        return 1;
                }
                break;
 -      case MSR_AMD64_NB_CFG:
 -              break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!data) {
                        /* We support the non-activated case already */
                vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
                            __func__, data);
                break;
 -      case MSR_IA32_UCODE_REV:
 -      case MSR_IA32_UCODE_WRITE:
 -      case MSR_VM_HSAVE_PA:
 -      case MSR_AMD64_PATCH_LOADER:
 -              break;
        case 0x200 ... 0x2ff:
                return set_msr_mtrr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
@@@ -2254,7 -2249,6 +2252,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
        case MSR_K8_INT_PENDING_MSG:
        case MSR_AMD64_NB_CFG:
        case MSR_FAM10H_MMIO_CONF_BASE:
 +      case MSR_AMD64_BU_CFG2:
                data = 0;
                break;
        case MSR_P6_PERFCTR0:
@@@ -2522,7 -2516,7 +2520,7 @@@ int kvm_dev_ioctl_check_extension(long 
                r = KVM_MAX_VCPUS;
                break;
        case KVM_CAP_NR_MEMSLOTS:
-               r = KVM_MEMORY_SLOTS;
+               r = KVM_USER_MEM_SLOTS;
                break;
        case KVM_CAP_PV_MMU:    /* obsolete */
                r = 0;
@@@ -3274,12 -3268,10 +3272,10 @@@ static int kvm_vm_ioctl_set_nr_mmu_page
                return -EINVAL;
  
        mutex_lock(&kvm->slots_lock);
-       spin_lock(&kvm->mmu_lock);
  
        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
  
-       spin_unlock(&kvm->mmu_lock);
        mutex_unlock(&kvm->slots_lock);
        return 0;
  }
@@@ -3439,7 -3431,7 +3435,7 @@@ int kvm_vm_ioctl_get_dirty_log(struct k
        mutex_lock(&kvm->slots_lock);
  
        r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
+       if (log->slot >= KVM_USER_MEM_SLOTS)
                goto out;
  
        memslot = id_to_memslot(kvm->memslots, log->slot);
@@@ -4495,8 -4487,10 +4491,10 @@@ static bool emulator_get_segment(struc
        kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
        *selector = var.selector;
  
-       if (var.unusable)
+       if (var.unusable) {
+               memset(desc, 0, sizeof(*desc));
                return false;
+       }
  
        if (var.g)
                var.limit >>= 12;
@@@ -4757,26 -4751,26 +4755,26 @@@ static int handle_emulation_failure(str
        return r;
  }
  
- static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+                                 bool write_fault_to_shadow_pgtable)
  {
-       gpa_t gpa;
+       gpa_t gpa = cr2;
        pfn_t pfn;
  
-       if (tdp_enabled)
-               return false;
-       /*
-        * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-enter the
-        * guest to let CPU execute the instruction.
-        */
-       if (kvm_mmu_unprotect_page_virt(vcpu, gva))
-               return true;
-       gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+       if (!vcpu->arch.mmu.direct_map) {
+               /*
+                * Write permission should be allowed since only
+                * write access needs to be emulated.
+                */
+               gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
  
-       if (gpa == UNMAPPED_GVA)
-               return true; /* let cpu generate fault */
+               /*
+                * If the mapping is invalid in the guest, let the CPU retry
+                * it to generate a fault.
+                */
+               if (gpa == UNMAPPED_GVA)
+                       return true;
+       }
  
        /*
         * Do not retry the unhandleable instruction if it faults on the
         * instruction -> ...
         */
        pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-       if (!is_error_noslot_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
+       /*
+        * If the instruction failed on the error pfn, it cannot be fixed;
+        * report the error to userspace.
+        */
+       if (is_error_noslot_pfn(pfn))
+               return false;
+       kvm_release_pfn_clean(pfn);
+       /* The instructions are well-emulated on direct mmu. */
+       if (vcpu->arch.mmu.direct_map) {
+               unsigned int indirect_shadow_pages;
+               spin_lock(&vcpu->kvm->mmu_lock);
+               indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               if (indirect_shadow_pages)
+                       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
                return true;
        }
  
-       return false;
+       /*
+        * If emulation was due to an access to a shadowed page table
+        * and it failed, try to unshadow the page and re-enter the
+        * guest to let the CPU execute the instruction.
+        */
+       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+       /*
+        * If the access faults on its page table, it cannot
+        * be fixed by unprotecting the shadow page, and it should
+        * be reported to userspace.
+        */
+       return !write_fault_to_shadow_pgtable;
  }
  
  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
        if (!vcpu->arch.mmu.direct_map)
                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
  
-       kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
  
        return true;
  }
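The retry paths now use gpa_to_gfn() rather than open-coding the shift; the
helper is essentially the page-frame conversion:

    static inline gfn_t gpa_to_gfn(gpa_t gpa)
    {
            return (gfn_t)(gpa >> PAGE_SHIFT);  /* e.g. 0x12345000 -> 0x12345 */
    }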
@@@ -4849,7 -4874,13 +4878,13 @@@ int x86_emulate_instruction(struct kvm_
        int r;
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        bool writeback = true;
+       bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
  
+       /*
+        * Clear write_fault_to_shadow_pgtable here to ensure it is
+        * never reused.
+        */
+       vcpu->arch.write_fault_to_shadow_pgtable = false;
        kvm_clear_exception_queue(vcpu);
  
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
                if (r != EMULATION_OK)  {
                        if (emulation_type & EMULTYPE_TRAP_UD)
                                return EMULATE_FAIL;
-                       if (reexecute_instruction(vcpu, cr2))
+                       if (reexecute_instruction(vcpu, cr2,
+                                                 write_fault_to_spt))
                                return EMULATE_DONE;
                        if (emulation_type & EMULTYPE_SKIP)
                                return EMULATE_FAIL;
@@@ -4898,7 -4930,7 +4934,7 @@@ restart
                return EMULATE_DONE;
  
        if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2))
+               if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
                        return EMULATE_DONE;
  
                return handle_emulation_failure(vcpu);
@@@ -5237,16 -5269,9 +5273,16 @@@ int kvm_arch_init(void *opaque
                goto out;
        }
  
 +      r = -ENOMEM;
 +      shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 +      if (!shared_msrs) {
 +              printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
 +              goto out;
 +      }
 +
        r = kvm_mmu_module_init();
        if (r)
 -              goto out;
 +              goto out_free_percpu;
  
        kvm_set_mmio_spte_mask();
        kvm_init_msr_list();
  
        return 0;
  
 +out_free_percpu:
 +      free_percpu(shared_msrs);
  out:
        return r;
  }
@@@ -5288,7 -5311,6 +5324,7 @@@ void kvm_arch_exit(void
  #endif
        kvm_x86_ops = NULL;
        kvm_mmu_module_exit();
 +      free_percpu(shared_msrs);
  }
  
  int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@@ -5541,7 -5563,7 +5577,7 @@@ static void inject_pending_event(struc
                        vcpu->arch.nmi_injected = true;
                        kvm_x86_ops->set_nmi(vcpu);
                }
-       } else if (kvm_cpu_has_interrupt(vcpu)) {
+       } else if (kvm_cpu_has_injectable_intr(vcpu)) {
                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
                                            false);
@@@ -5609,6 -5631,16 +5645,16 @@@ static void kvm_gen_update_masterclock(
  #endif
  }
  
+ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+ {
+       u64 eoi_exit_bitmap[4];
+       memset(eoi_exit_bitmap, 0, 32);
+       kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+       kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ }
  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  {
        int r;
                        kvm_handle_pmu_event(vcpu);
                if (kvm_check_request(KVM_REQ_PMI, vcpu))
                        kvm_deliver_pmi(vcpu);
+               if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+                       update_eoi_exitmap(vcpu);
        }
  
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
                /* enable NMI/IRQ window open exits if needed */
                if (vcpu->arch.nmi_pending)
                        kvm_x86_ops->enable_nmi_window(vcpu);
-               else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+               else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
                        kvm_x86_ops->enable_irq_window(vcpu);
  
                if (kvm_lapic_enabled(vcpu)) {
+                       /*
+                        * Update architecture specific hints for APIC
+                        * virtual interrupt delivery.
+                        */
+                       if (kvm_x86_ops->hwapic_irr_update)
+                               kvm_x86_ops->hwapic_irr_update(vcpu,
+                                       kvm_lapic_find_highest_irr(vcpu));
                        update_cr8_intercept(vcpu);
                        kvm_lapic_sync_to_vapic(vcpu);
                }
@@@ -6853,48 -6894,43 +6908,43 @@@ int kvm_arch_prepare_memory_region(stru
                                struct kvm_memory_slot *memslot,
                                struct kvm_memory_slot old,
                                struct kvm_userspace_memory_region *mem,
-                               int user_alloc)
+                               bool user_alloc)
  {
        int npages = memslot->npages;
-       int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
  
-       /* Prevent internal slot pages from being moved by fork()/COW. */
-       if (memslot->id >= KVM_MEMORY_SLOTS)
-               map_flags = MAP_SHARED | MAP_ANONYMOUS;
-       /*To keep backward compatibility with older userspace,
-        *x86 needs to handle !user_alloc case.
+       /*
+        * Only private memory slots need to be mapped here, since the
+        * KVM_SET_MEMORY_REGION ioctl is no longer supported.
         */
-       if (!user_alloc) {
-               if (npages && !old.npages) {
-                       unsigned long userspace_addr;
+       if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+               unsigned long userspace_addr;
  
-                       userspace_addr = vm_mmap(NULL, 0,
-                                                npages * PAGE_SIZE,
-                                                PROT_READ | PROT_WRITE,
-                                                map_flags,
-                                                0);
+               /*
+                * MAP_SHARED to prevent internal slot pages from being moved
+                * by fork()/COW.
+                */
+               userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+                                        PROT_READ | PROT_WRITE,
+                                        MAP_SHARED | MAP_ANONYMOUS, 0);
  
-                       if (IS_ERR((void *)userspace_addr))
-                               return PTR_ERR((void *)userspace_addr);
+               if (IS_ERR((void *)userspace_addr))
+                       return PTR_ERR((void *)userspace_addr);
  
-                       memslot->userspace_addr = userspace_addr;
-               }
+               memslot->userspace_addr = userspace_addr;
        }
  
        return 0;
  }
  
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
                                struct kvm_memory_slot old,
-                               int user_alloc)
+                               bool user_alloc)
  {
  
        int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
  
-       if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+       if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
                int ret;
  
                ret = vm_munmap(old.userspace_addr,
        if (!kvm->arch.n_requested_mmu_pages)
                nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
  
-       spin_lock(&kvm->mmu_lock);
        if (nr_mmu_pages)
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
-       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       spin_unlock(&kvm->mmu_lock);
+       /*
+        * Write protect all pages for dirty logging.
+        * Existing largepage mappings are destroyed here and new ones will
+        * not be created until the end of the logging.
+        */
+       if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+               kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        /*
         * If memory slot is created, or moved, we need to clear all
         * mmio sptes.
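The private-slot setup above maps with MAP_SHARED | MAP_ANONYMOUS so that
fork() shares the pages instead of marking them copy-on-write, which could
otherwise move them under KVM. The equivalent plain userspace call, for
illustration:

    #include <sys/mman.h>

    void *map_shared_anon(size_t size)
    {
            return mmap(NULL, size, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    }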
index 8491111aec12d45b780243a907f71bb86adbfe17,b846b6c4130ab6b292c672cf7915e9002752ed39..03a15e016778c60b246b40ae173fd6dd9866935f
@@@ -392,7 -392,7 +392,7 @@@ static void kvm_extint_handler(struct e
  
        if ((ext_code.subcode & 0xff00) != VIRTIO_SUBCODE_64)
                return;
 -      kstat_cpu(smp_processor_id()).irqs[EXTINT_VRT]++;
 +      inc_irq_stat(IRQEXT_VRT);
  
        /* The LSB might be overloaded, we have to mask it */
        vq = (struct virtqueue *)(param64 & ~1UL);
        }
  }
  
+ /*
+  * For s390-virtio, we expect a page above main storage containing
+  * the virtio configuration. Try to actually load from this area
+  * in order to figure out if the host provides this page.
+  */
+ static int __init test_devices_support(unsigned long addr)
+ {
+       int ret = -EIO;
+       asm volatile(
+               "0:     lura    0,%1\n"
+               "1:     xgr     %0,%0\n"
+               "2:\n"
+               EX_TABLE(0b,2b)
+               EX_TABLE(1b,2b)
+               : "+d" (ret)
+               : "a" (addr)
+               : "0", "cc");
+       return ret;
+ }
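test_devices_support() is an exception-protected probe: if the host does not
back the page, the lura (load using real address) faults and the EX_TABLE
entries redirect execution past the load with ret still -EIO. A rough generic
analogue using the kernel's probing helper, with the caveat that
probe_kernel_read() works on virtual rather than real addresses:

    #include <linux/uaccess.h>

    static int can_read(const void *addr)
    {
            unsigned long dummy;
            /* Returns 0 on success, -EFAULT if the access would fault. */
            return probe_kernel_read(&dummy, addr, sizeof(dummy));
    }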
  /*
   * Init function for virtio
   * devices are in a single page above top of "normal" mem
@@@ -432,21 -452,23 +452,23 @@@ static int __init kvm_devices_init(void
        if (!MACHINE_IS_KVM)
                return -ENODEV;
  
+       if (test_devices_support(real_memory_size) < 0)
+               return -ENODEV;
+       rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
+       if (rc)
+               return rc;
+       kvm_devices = (void *) real_memory_size;
        kvm_root = root_device_register("kvm_s390");
        if (IS_ERR(kvm_root)) {
                rc = PTR_ERR(kvm_root);
                printk(KERN_ERR "Could not register kvm_s390 root device");
+               vmem_remove_mapping(real_memory_size, PAGE_SIZE);
                return rc;
        }
  
-       rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
-       if (rc) {
-               root_device_unregister(kvm_root);
-               return rc;
-       }
-       kvm_devices = (void *) real_memory_size;
        INIT_WORK(&hotplug_work, hotplug_devices);
  
        service_subclass_irq_register();
index 0000000000000000000000000000000000000000,3217dfe5cb8b24d71029d5459da6c5886a6ba6ef..2029b6caa5956c2b4a01044b83325a0de16e3104
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,926 +1,926 @@@
 -      .int_class = IOINT_VIR,
+ /*
+  * ccw based virtio transport
+  *
+  * Copyright IBM Corp. 2012
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License (version 2 only)
+  * as published by the Free Software Foundation.
+  *
+  *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+  */
+ #include <linux/kernel_stat.h>
+ #include <linux/init.h>
+ #include <linux/bootmem.h>
+ #include <linux/err.h>
+ #include <linux/virtio.h>
+ #include <linux/virtio_config.h>
+ #include <linux/slab.h>
+ #include <linux/interrupt.h>
+ #include <linux/virtio_ring.h>
+ #include <linux/pfn.h>
+ #include <linux/async.h>
+ #include <linux/wait.h>
+ #include <linux/list.h>
+ #include <linux/bitops.h>
+ #include <linux/module.h>
+ #include <linux/io.h>
+ #include <linux/kvm_para.h>
+ #include <asm/setup.h>
+ #include <asm/irq.h>
+ #include <asm/cio.h>
+ #include <asm/ccwdev.h>
+ /*
+  * virtio related functions
+  */
+ struct vq_config_block {
+       __u16 index;
+       __u16 num;
+ } __packed;
+ #define VIRTIO_CCW_CONFIG_SIZE 0x100
+ /* same as PCI config space size, should be enough for all drivers */
+ struct virtio_ccw_device {
+       struct virtio_device vdev;
+       __u8 *status;
+       __u8 config[VIRTIO_CCW_CONFIG_SIZE];
+       struct ccw_device *cdev;
+       __u32 curr_io;
+       int err;
+       wait_queue_head_t wait_q;
+       spinlock_t lock;
+       struct list_head virtqueues;
+       unsigned long indicators;
+       unsigned long indicators2;
+       struct vq_config_block *config_block;
+ };
+ struct vq_info_block {
+       __u64 queue;
+       __u32 align;
+       __u16 index;
+       __u16 num;
+ } __packed;
+ struct virtio_feature_desc {
+       __u32 features;
+       __u8 index;
+ } __packed;
+ struct virtio_ccw_vq_info {
+       struct virtqueue *vq;
+       int num;
+       void *queue;
+       struct vq_info_block *info_block;
+       struct list_head node;
+ };
+ #define KVM_VIRTIO_CCW_RING_ALIGN 4096
+ #define KVM_S390_VIRTIO_CCW_NOTIFY 3
+ #define CCW_CMD_SET_VQ 0x13
+ #define CCW_CMD_VDEV_RESET 0x33
+ #define CCW_CMD_SET_IND 0x43
+ #define CCW_CMD_SET_CONF_IND 0x53
+ #define CCW_CMD_READ_FEAT 0x12
+ #define CCW_CMD_WRITE_FEAT 0x11
+ #define CCW_CMD_READ_CONF 0x22
+ #define CCW_CMD_WRITE_CONF 0x21
+ #define CCW_CMD_WRITE_STATUS 0x31
+ #define CCW_CMD_READ_VQ_CONF 0x32
+ #define VIRTIO_CCW_DOING_SET_VQ 0x00010000
+ #define VIRTIO_CCW_DOING_RESET 0x00040000
+ #define VIRTIO_CCW_DOING_READ_FEAT 0x00080000
+ #define VIRTIO_CCW_DOING_WRITE_FEAT 0x00100000
+ #define VIRTIO_CCW_DOING_READ_CONFIG 0x00200000
+ #define VIRTIO_CCW_DOING_WRITE_CONFIG 0x00400000
+ #define VIRTIO_CCW_DOING_WRITE_STATUS 0x00800000
+ #define VIRTIO_CCW_DOING_SET_IND 0x01000000
+ #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
+ #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
+ #define VIRTIO_CCW_INTPARM_MASK 0xffff0000
+ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
+ {
+       return container_of(vdev, struct virtio_ccw_device, vdev);
+ }
+ static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
+ {
+       unsigned long flags;
+       __u32 ret;
+       spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
+       if (vcdev->err)
+               ret = 0;
+       else
+               ret = vcdev->curr_io & flag;
+       spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
+       return ret;
+ }
+ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
+                        struct ccw1 *ccw, __u32 intparm)
+ {
+       int ret;
+       unsigned long flags;
+       int flag = intparm & VIRTIO_CCW_INTPARM_MASK;
+       do {
+               spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
+               ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0);
+               if (!ret)
+                       vcdev->curr_io |= flag;
+               spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
+               cpu_relax();
+       } while (ret == -EBUSY);
+       wait_event(vcdev->wait_q, doing_io(vcdev, flag) == 0);
+       return ret ? ret : vcdev->err;
+ }
+ static inline long do_kvm_notify(struct subchannel_id schid,
+                                unsigned long queue_index)
+ {
+       register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY;
+       register struct subchannel_id __schid asm("2") = schid;
+       register unsigned long __index asm("3") = queue_index;
+       register long __rc asm("2");
+       asm volatile ("diag 2,4,0x500\n"
+                     : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index)
+                     : "memory", "cc");
+       return __rc;
+ }
+ static void virtio_ccw_kvm_notify(struct virtqueue *vq)
+ {
+       struct virtio_ccw_vq_info *info = vq->priv;
+       struct virtio_ccw_device *vcdev;
+       struct subchannel_id schid;
+       vcdev = to_vc_device(info->vq->vdev);
+       ccw_device_get_schid(vcdev->cdev, &schid);
+       do_kvm_notify(schid, virtqueue_get_queue_index(vq));
+ }
+ static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
+                                  struct ccw1 *ccw, int index)
+ {
+       vcdev->config_block->index = index;
+       ccw->cmd_code = CCW_CMD_READ_VQ_CONF;
+       ccw->flags = 0;
+       ccw->count = sizeof(struct vq_config_block);
+       ccw->cda = (__u32)(unsigned long)(vcdev->config_block);
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_VQ_CONF);
+       return vcdev->config_block->num;
+ }
+ static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vq->vdev);
+       struct virtio_ccw_vq_info *info = vq->priv;
+       unsigned long flags;
+       unsigned long size;
+       int ret;
+       unsigned int index = virtqueue_get_queue_index(vq);
+       /* Remove from our list. */
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_del(&info->node);
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+       /* Release from host. */
+       info->info_block->queue = 0;
+       info->info_block->align = 0;
+       info->info_block->index = index;
+       info->info_block->num = 0;
+       ccw->cmd_code = CCW_CMD_SET_VQ;
+       ccw->flags = 0;
+       ccw->count = sizeof(*info->info_block);
+       ccw->cda = (__u32)(unsigned long)(info->info_block);
+       ret = ccw_io_helper(vcdev, ccw,
+                           VIRTIO_CCW_DOING_SET_VQ | index);
+       /*
+        * -ENODEV isn't considered an error: The device is gone anyway.
+        * This may happen on device detach.
+        */
+       if (ret && (ret != -ENODEV))
+               dev_warn(&vq->vdev->dev, "Error %d while deleting queue %d",
+                        ret, index);
+       vring_del_virtqueue(vq);
+       size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
+       free_pages_exact(info->queue, size);
+       kfree(info->info_block);
+       kfree(info);
+ }
+ static void virtio_ccw_del_vqs(struct virtio_device *vdev)
+ {
+       struct virtqueue *vq, *n;
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+               virtio_ccw_del_vq(vq, ccw);
+       kfree(ccw);
+ }
+ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
+                                            int i, vq_callback_t *callback,
+                                            const char *name,
+                                            struct ccw1 *ccw)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       int err;
+       struct virtqueue *vq = NULL;
+       struct virtio_ccw_vq_info *info;
+       unsigned long size = 0; /* silence the compiler */
+       unsigned long flags;
+       /* Allocate queue. */
+       info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL);
+       if (!info) {
+               dev_warn(&vcdev->cdev->dev, "no info\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       info->info_block = kzalloc(sizeof(*info->info_block),
+                                  GFP_DMA | GFP_KERNEL);
+       if (!info->info_block) {
+               dev_warn(&vcdev->cdev->dev, "no info block\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       info->num = virtio_ccw_read_vq_conf(vcdev, ccw, i);
+       size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
+       info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
+       if (info->queue == NULL) {
+               dev_warn(&vcdev->cdev->dev, "no queue\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev,
+                                true, info->queue, virtio_ccw_kvm_notify,
+                                callback, name);
+       if (!vq) {
+               /* For now, we fail if we can't get the requested size. */
+               dev_warn(&vcdev->cdev->dev, "no vq\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       /* Register it with the host. */
+       info->info_block->queue = (__u64)info->queue;
+       info->info_block->align = KVM_VIRTIO_CCW_RING_ALIGN;
+       info->info_block->index = i;
+       info->info_block->num = info->num;
+       ccw->cmd_code = CCW_CMD_SET_VQ;
+       ccw->flags = 0;
+       ccw->count = sizeof(*info->info_block);
+       ccw->cda = (__u32)(unsigned long)(info->info_block);
+       err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i);
+       if (err) {
+               dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n");
+               goto out_err;
+       }
+       info->vq = vq;
+       vq->priv = info;
+       /* Save it to our list. */
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_add(&info->node, &vcdev->virtqueues);
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+       return vq;
+ out_err:
+       if (vq)
+               vring_del_virtqueue(vq);
+       if (info) {
+               if (info->queue)
+                       free_pages_exact(info->queue, size);
+               kfree(info->info_block);
+       }
+       kfree(info);
+       return ERR_PTR(err);
+ }
+ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+                              struct virtqueue *vqs[],
+                              vq_callback_t *callbacks[],
+                              const char *names[])
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       unsigned long *indicatorp = NULL;
+       int ret, i;
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return -ENOMEM;
+       for (i = 0; i < nvqs; ++i) {
+               vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i],
+                                            ccw);
+               if (IS_ERR(vqs[i])) {
+                       ret = PTR_ERR(vqs[i]);
+                       vqs[i] = NULL;
+                       goto out;
+               }
+       }
+       ret = -ENOMEM;
+       /* We need a data area under 2G to communicate. */
+       indicatorp = kmalloc(sizeof(&vcdev->indicators), GFP_DMA | GFP_KERNEL);
+       if (!indicatorp)
+               goto out;
+       *indicatorp = (unsigned long) &vcdev->indicators;
+       /* Register queue indicators with host. */
+       vcdev->indicators = 0;
+       ccw->cmd_code = CCW_CMD_SET_IND;
+       ccw->flags = 0;
+       ccw->count = sizeof(vcdev->indicators);
+       ccw->cda = (__u32)(unsigned long) indicatorp;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
+       if (ret)
+               goto out;
+       /* Register indicators2 with host for config changes */
+       *indicatorp = (unsigned long) &vcdev->indicators2;
+       vcdev->indicators2 = 0;
+       ccw->cmd_code = CCW_CMD_SET_CONF_IND;
+       ccw->flags = 0;
+       ccw->count = sizeof(vcdev->indicators2);
+       ccw->cda = (__u32)(unsigned long) indicatorp;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_CONF_IND);
+       if (ret)
+               goto out;
+       kfree(indicatorp);
+       kfree(ccw);
+       return 0;
+ out:
+       kfree(indicatorp);
+       kfree(ccw);
+       virtio_ccw_del_vqs(vdev);
+       return ret;
+ }
+ static void virtio_ccw_reset(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       /* Zero status bits. */
+       *vcdev->status = 0;
+       /* Send a reset ccw on device. */
+       ccw->cmd_code = CCW_CMD_VDEV_RESET;
+       ccw->flags = 0;
+       ccw->count = 0;
+       ccw->cda = 0;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_RESET);
+       kfree(ccw);
+ }
+ static u32 virtio_ccw_get_features(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct virtio_feature_desc *features;
+       int ret, rc;
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return 0;
+       features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
+       if (!features) {
+               rc = 0;
+               goto out_free;
+       }
+       /* Read the feature bits from the host. */
+       /* TODO: Features > 32 bits */
+       features->index = 0;
+       ccw->cmd_code = CCW_CMD_READ_FEAT;
+       ccw->flags = 0;
+       ccw->count = sizeof(*features);
+       ccw->cda = (__u32)(unsigned long)features;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_FEAT);
+       if (ret) {
+               rc = 0;
+               goto out_free;
+       }
+       rc = le32_to_cpu(features->features);
+ out_free:
+       kfree(features);
+       kfree(ccw);
+       return rc;
+ }
+ static void virtio_ccw_finalize_features(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct virtio_feature_desc *features;
+       int i;
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
+       if (!features)
+               goto out_free;
+       /* Give virtio_ring a chance to accept features. */
+       vring_transport_features(vdev);
+       for (i = 0; i < sizeof(*vdev->features) / sizeof(features->features);
+            i++) {
+               int highbits = i % 2 ? 32 : 0;
+               features->index = i;
+               features->features = cpu_to_le32(vdev->features[i / 2]
+                                                >> highbits);
+               /* Write the feature bits to the host. */
+               ccw->cmd_code = CCW_CMD_WRITE_FEAT;
+               ccw->flags = 0;
+               ccw->count = sizeof(*features);
+               ccw->cda = (__u32)(unsigned long)features;
+               ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_FEAT);
+       }
+ out_free:
+       kfree(features);
+       kfree(ccw);
+ }
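
The write loop walks the feature bitmap 32 bits at a time: descriptor index i maps to word i / 2 of vdev->features and, through highbits, to the low or high half of that word, so on 64-bit s390 each unsigned long produces two descriptors. The index arithmetic in isolation (illustrative names):

#include <linux/types.h>

/* Hypothetical restatement of the index -> (word, shift) mapping above;
 * assumes 64-bit unsigned long, as on s390x. */
static u32 feature_chunk(const unsigned long *features, int index)
{
	int word  = index / 2;			/* which unsigned long */
	int shift = (index % 2) ? 32 : 0;	/* low or high 32 bits */

	return (u32)(features[word] >> shift);
}
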
+ static void virtio_ccw_get_config(struct virtio_device *vdev,
+                                 unsigned int offset, void *buf, unsigned len)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       int ret;
+       struct ccw1 *ccw;
+       void *config_area;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
+       if (!config_area)
+               goto out_free;
+       /* Read the config area from the host. */
+       ccw->cmd_code = CCW_CMD_READ_CONF;
+       ccw->flags = 0;
+       ccw->count = offset + len;
+       ccw->cda = (__u32)(unsigned long)config_area;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_CONFIG);
+       if (ret)
+               goto out_free;
+       memcpy(vcdev->config, config_area, sizeof(vcdev->config));
+       memcpy(buf, &vcdev->config[offset], len);
+ out_free:
+       kfree(config_area);
+       kfree(ccw);
+ }
+ static void virtio_ccw_set_config(struct virtio_device *vdev,
+                                 unsigned int offset, const void *buf,
+                                 unsigned len)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+       void *config_area;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
+       if (!config_area)
+               goto out_free;
+       memcpy(&vcdev->config[offset], buf, len);
+       /* Write the config area to the host. */
+       memcpy(config_area, vcdev->config, sizeof(vcdev->config));
+       ccw->cmd_code = CCW_CMD_WRITE_CONF;
+       ccw->flags = 0;
+       ccw->count = offset + len;
+       ccw->cda = (__u32)(unsigned long)config_area;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_CONFIG);
+ out_free:
+       kfree(config_area);
+       kfree(ccw);
+ }
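
Both config accessors transfer a window of offset + len bytes starting at byte 0: a CCW carries no offset field, so ccw->count = offset + len makes the channel program cover everything up to the end of the requested field, and set_config is a read-modify-write against the cached copy in vcdev->config rather than against the device. A bounds check of the kind a caller must respect (this check is an assumption for illustration; it does not appear in the code):

/* Hypothetical sanity check for the offset/len window used above. */
static bool config_window_ok(unsigned int offset, unsigned int len)
{
	return offset + len >= offset &&		/* no wrap */
	       offset + len <= VIRTIO_CCW_CONFIG_SIZE;
}
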
+ static u8 virtio_ccw_get_status(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       return *vcdev->status;
+ }
+ static void virtio_ccw_set_status(struct virtio_device *vdev, u8 status)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+       /* Write the status to the host. */
+       *vcdev->status = status;
+       ccw->cmd_code = CCW_CMD_WRITE_STATUS;
+       ccw->flags = 0;
+       ccw->count = sizeof(status);
+       ccw->cda = (__u32)(unsigned long)vcdev->status;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_STATUS);
+       kfree(ccw);
+ }
+ static struct virtio_config_ops virtio_ccw_config_ops = {
+       .get_features = virtio_ccw_get_features,
+       .finalize_features = virtio_ccw_finalize_features,
+       .get = virtio_ccw_get_config,
+       .set = virtio_ccw_set_config,
+       .get_status = virtio_ccw_get_status,
+       .set_status = virtio_ccw_set_status,
+       .reset = virtio_ccw_reset,
+       .find_vqs = virtio_ccw_find_vqs,
+       .del_vqs = virtio_ccw_del_vqs,
+ };
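
This ops table is the entire contract between the transport and the virtio core: the core never issues channel I/O itself, it only calls through virtio_config_ops, so porting virtio to a new bus amounts to little more than filling in such a table. A driver reaches the accessors above indirectly, e.g.:

#include <linux/virtio.h>

/* The core routes this through vdev->config->get(), i.e.
 * virtio_ccw_get_config() above (illustrative helper). */
static u32 read_config_dword(struct virtio_device *vdev, unsigned int offset)
{
	u32 val;

	vdev->config->get(vdev, offset, &val, sizeof(val));
	return val;
}
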
+ /*
+  * ccw bus driver related functions
+  */
+ static void virtio_ccw_release_dev(struct device *_d)
+ {
+       struct virtio_device *dev = container_of(_d, struct virtio_device,
+                                                dev);
+       struct virtio_ccw_device *vcdev = to_vc_device(dev);
+       kfree(vcdev->status);
+       kfree(vcdev->config_block);
+       kfree(vcdev);
+ }
+ static int irb_is_error(struct irb *irb)
+ {
+       if (scsw_cstat(&irb->scsw) != 0)
+               return 1;
+       if (scsw_dstat(&irb->scsw) & ~(DEV_STAT_CHN_END | DEV_STAT_DEV_END))
+               return 1;
+       if (scsw_cc(&irb->scsw) != 0)
+               return 1;
+       return 0;
+ }
+ static struct virtqueue *virtio_ccw_vq_by_ind(struct virtio_ccw_device *vcdev,
+                                             int index)
+ {
+       struct virtio_ccw_vq_info *info;
+       unsigned long flags;
+       struct virtqueue *vq;
+       vq = NULL;
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_for_each_entry(info, &vcdev->virtqueues, node) {
+               if (virtqueue_get_queue_index(info->vq) == index) {
+                       vq = info->vq;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+       return vq;
+ }
+ static void virtio_ccw_int_handler(struct ccw_device *cdev,
+                                  unsigned long intparm,
+                                  struct irb *irb)
+ {
+       __u32 activity = intparm & VIRTIO_CCW_INTPARM_MASK;
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       int i;
+       struct virtqueue *vq;
+       struct virtio_driver *drv;
+       /* Check if it's a notification from the host. */
+       if ((intparm == 0) &&
+           (scsw_stctl(&irb->scsw) ==
+            (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND))) {
+               /* OK */
+       }
+       if (irb_is_error(irb))
+               vcdev->err = -EIO; /* XXX - use real error */
+       if (vcdev->curr_io & activity) {
+               switch (activity) {
+               case VIRTIO_CCW_DOING_READ_FEAT:
+               case VIRTIO_CCW_DOING_WRITE_FEAT:
+               case VIRTIO_CCW_DOING_READ_CONFIG:
+               case VIRTIO_CCW_DOING_WRITE_CONFIG:
+               case VIRTIO_CCW_DOING_WRITE_STATUS:
+               case VIRTIO_CCW_DOING_SET_VQ:
+               case VIRTIO_CCW_DOING_SET_IND:
+               case VIRTIO_CCW_DOING_SET_CONF_IND:
+               case VIRTIO_CCW_DOING_RESET:
+               case VIRTIO_CCW_DOING_READ_VQ_CONF:
+                       vcdev->curr_io &= ~activity;
+                       wake_up(&vcdev->wait_q);
+                       break;
+               default:
+                       /* don't know what to do... */
+                       dev_warn(&cdev->dev, "Suspicious activity '%08x'\n",
+                                activity);
+                       WARN_ON(1);
+                       break;
+               }
+       }
+       for_each_set_bit(i, &vcdev->indicators,
+                        sizeof(vcdev->indicators) * BITS_PER_BYTE) {
+               /* The bit clear must happen before the vring kick. */
+               clear_bit(i, &vcdev->indicators);
+               barrier();
+               vq = virtio_ccw_vq_by_ind(vcdev, i);
+               vring_interrupt(0, vq);
+       }
+       if (test_bit(0, &vcdev->indicators2)) {
+               drv = container_of(vcdev->vdev.dev.driver,
+                                  struct virtio_driver, driver);
+               if (drv && drv->config_changed)
+                       drv->config_changed(&vcdev->vdev);
+               clear_bit(0, &vcdev->indicators2);
+       }
+ }
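
The handler multiplexes three event classes: completion of the transport's own channel programs (the curr_io & activity cases), virtqueue notifications via the indicators bitmap (cleared before the vring kick so a notification racing in after the clear is not lost), and config-change notifications via indicators2. The VIRTIO_CCW_DOING_* cases are the wake half of a completion protocol whose sleep half lives in ccw_io_helper(); presumably that side looks roughly like this (sketch; the real helper is defined earlier in the file and the locking is omitted):

/* Presumed shape of the issuing side that this handler wakes. */
static int issue_and_wait(struct virtio_ccw_device *vcdev, __u32 flag)
{
	vcdev->curr_io |= flag;
	/* ... start the channel program on vcdev->cdev ... */
	wait_event(vcdev->wait_q, !(vcdev->curr_io & flag));
	return vcdev->err;
}
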
+ /*
+  * We usually want to auto-online all devices, but give the admin
+  * a way to exempt devices from this.
+  */
+ #define __DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
+                    (8*sizeof(long)))
+ static unsigned long devs_no_auto[__MAX_SSID + 1][__DEV_WORDS];
+ static char *no_auto = "";
+ module_param(no_auto, charp, 0444);
+ MODULE_PARM_DESC(no_auto, "list of ccw bus id ranges not to be auto-onlined");
+ static int virtio_ccw_check_autoonline(struct ccw_device *cdev)
+ {
+       struct ccw_dev_id id;
+       ccw_device_get_id(cdev, &id);
+       if (test_bit(id.devno, devs_no_auto[id.ssid]))
+               return 0;
+       return 1;
+ }
+ static void virtio_ccw_auto_online(void *data, async_cookie_t cookie)
+ {
+       struct ccw_device *cdev = data;
+       int ret;
+       ret = ccw_device_set_online(cdev);
+       if (ret)
+               dev_warn(&cdev->dev, "Failed to set online: %d\n", ret);
+ }
+ static int virtio_ccw_probe(struct ccw_device *cdev)
+ {
+       cdev->handler = virtio_ccw_int_handler;
+       if (virtio_ccw_check_autoonline(cdev))
+               async_schedule(virtio_ccw_auto_online, cdev);
+       return 0;
+ }
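
probe() stays minimal on purpose: ccw_device_set_online() can involve real channel I/O, so unless the admin has exempted the device through the no_auto machinery below, onlining is handed to the async framework and device bring-up proceeds in parallel with the rest of boot. The policy decision in one expression:

/* Sketch: the probe-time auto-online decision made above. */
static bool should_auto_online(struct ccw_dev_id *id)
{
	return !test_bit(id->devno, devs_no_auto[id->ssid]);
}
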
+ static void virtio_ccw_remove(struct ccw_device *cdev)
+ {
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       if (cdev->online) {
+               unregister_virtio_device(&vcdev->vdev);
+               dev_set_drvdata(&cdev->dev, NULL);
+       }
+       cdev->handler = NULL;
+ }
+ static int virtio_ccw_offline(struct ccw_device *cdev)
+ {
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       unregister_virtio_device(&vcdev->vdev);
+       dev_set_drvdata(&cdev->dev, NULL);
+       return 0;
+ }
+ static int virtio_ccw_online(struct ccw_device *cdev)
+ {
+       int ret;
+       struct virtio_ccw_device *vcdev;
+       vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
+       if (!vcdev) {
+               dev_warn(&cdev->dev, "Could not get memory for virtio\n");
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
+                                  GFP_DMA | GFP_KERNEL);
+       if (!vcdev->config_block) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       vcdev->status = kzalloc(sizeof(*vcdev->status), GFP_DMA | GFP_KERNEL);
+       if (!vcdev->status) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       vcdev->vdev.dev.parent = &cdev->dev;
+       vcdev->vdev.dev.release = virtio_ccw_release_dev;
+       vcdev->vdev.config = &virtio_ccw_config_ops;
+       vcdev->cdev = cdev;
+       init_waitqueue_head(&vcdev->wait_q);
+       INIT_LIST_HEAD(&vcdev->virtqueues);
+       spin_lock_init(&vcdev->lock);
+       dev_set_drvdata(&cdev->dev, vcdev);
+       vcdev->vdev.id.vendor = cdev->id.cu_type;
+       vcdev->vdev.id.device = cdev->id.cu_model;
+       ret = register_virtio_device(&vcdev->vdev);
+       if (ret) {
+               dev_warn(&cdev->dev, "Failed to register virtio device: %d\n",
+                        ret);
+               goto out_put;
+       }
+       return 0;
+ out_put:
+       dev_set_drvdata(&cdev->dev, NULL);
+       put_device(&vcdev->vdev.dev);
+       return ret;
+ out_free:
+       if (vcdev) {
+               kfree(vcdev->status);
+               kfree(vcdev->config_block);
+       }
+       kfree(vcdev);
+       return ret;
+ }
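
The two error labels encode the usual struct device ownership rule: before register_virtio_device() has been attempted, nothing else references vcdev, so out_free releases the pieces by hand; once registration has been attempted, the embedded struct device owns the memory and out_put must go through put_device(), which ends in virtio_ccw_release_dev() above. Mixing the two up is a classic double-free or leak. In sketch form:

/* Sketch of the ownership rule applied by the error paths above. */
static int register_vcdev(struct virtio_ccw_device *vcdev)
{
	int ret = register_virtio_device(&vcdev->vdev);

	if (ret)	/* device is live: free via the release callback */
		put_device(&vcdev->vdev.dev);
	return ret;
}
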
+ static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event)
+ {
+       /* TODO: Check whether we need special handling here. */
+       return 0;
+ }
+ static struct ccw_device_id virtio_ids[] = {
+       { CCW_DEVICE(0x3832, 0) },
+       {},
+ };
+ MODULE_DEVICE_TABLE(ccw, virtio_ids);
+ static struct ccw_driver virtio_ccw_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "virtio_ccw",
+       },
+       .ids = virtio_ids,
+       .probe = virtio_ccw_probe,
+       .remove = virtio_ccw_remove,
+       .set_offline = virtio_ccw_offline,
+       .set_online = virtio_ccw_online,
+       .notify = virtio_ccw_cio_notify,
++      .int_class = IRQIO_VIR,
+ };
+ static int __init pure_hex(char **cp, unsigned int *val, int min_digit,
+                          int max_digit, int max_val)
+ {
+       int diff;
+       diff = 0;
+       *val = 0;
+       while (diff <= max_digit) {
+               int value = hex_to_bin(**cp);
+               if (value < 0)
+                       break;
+               *val = *val * 16 + value;
+               (*cp)++;
+               diff++;
+       }
+       if ((diff < min_digit) || (diff > max_digit) || (*val > max_val))
+               return 1;
+       return 0;
+ }
+ static int __init parse_busid(char *str, unsigned int *cssid,
+                             unsigned int *ssid, unsigned int *devno)
+ {
+       char *str_work;
+       int rc, ret;
+       rc = 1;
+       if (*str == '\0')
+               goto out;
+       str_work = str;
+       ret = pure_hex(&str_work, cssid, 1, 2, __MAX_CSSID);
+       if (ret || (str_work[0] != '.'))
+               goto out;
+       str_work++;
+       ret = pure_hex(&str_work, ssid, 1, 1, __MAX_SSID);
+       if (ret || (str_work[0] != '.'))
+               goto out;
+       str_work++;
+       ret = pure_hex(&str_work, devno, 4, 4, __MAX_SUBCHANNEL);
+       if (ret || (str_work[0] != '\0'))
+               goto out;
+       rc = 0;
+ out:
+       return rc;
+ }
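
parse_busid() accepts the standard s390 bus id form cssid.ssid.devno: one or two hex digits, one hex digit, and exactly four hex digits, dot-separated, so 0.0.1234 parses while 0.0.123 does not. A hedged userspace re-implementation for experimenting with the grammar (lowercase hex only; the kernel's __MAX_CSSID/__MAX_SSID/__MAX_SUBCHANNEL range checks are omitted):

#include <stdio.h>

static int hexval(char c)
{
	if (c >= '0' && c <= '9') return c - '0';
	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
	return -1;
}

/* Consume between min and max hex digits, as pure_hex() does above. */
static int take_hex(const char **cp, unsigned int *val, int min, int max)
{
	int n = 0;

	*val = 0;
	while (n < max && hexval(**cp) >= 0) {
		*val = *val * 16 + hexval(*(*cp)++);
		n++;
	}
	return n < min;
}

static int busid(const char *s, unsigned int *cssid, unsigned int *ssid,
		 unsigned int *devno)
{
	return take_hex(&s, cssid, 1, 2) || *s++ != '.' ||
	       take_hex(&s, ssid, 1, 1)  || *s++ != '.' ||
	       take_hex(&s, devno, 4, 4) || *s != '\0';
}

int main(void)
{
	unsigned int c, s, d;

	if (!busid("0.0.1234", &c, &s, &d))
		printf("cssid=%x ssid=%x devno=%04x\n", c, s, d);
	return 0;
}
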
+ static void __init no_auto_parse(void)
+ {
+       unsigned int from_cssid, to_cssid, from_ssid, to_ssid, from, to;
+       char *parm, *str;
+       int rc;
+       str = no_auto;
+       while ((parm = strsep(&str, ","))) {
+               rc = parse_busid(strsep(&parm, "-"), &from_cssid,
+                                &from_ssid, &from);
+               if (rc)
+                       continue;
+               if (parm != NULL) {
+                       rc = parse_busid(parm, &to_cssid,
+                                        &to_ssid, &to);
+                       if ((from_ssid > to_ssid) ||
+                           ((from_ssid == to_ssid) && (from > to)))
+                               rc = -EINVAL;
+               } else {
+                       to_cssid = from_cssid;
+                       to_ssid = from_ssid;
+                       to = from;
+               }
+               if (rc)
+                       continue;
+               while ((from_ssid < to_ssid) ||
+                      ((from_ssid == to_ssid) && (from <= to))) {
+                       set_bit(from, devs_no_auto[from_ssid]);
+                       from++;
+                       if (from > __MAX_SUBCHANNEL) {
+                               from_ssid++;
+                               from = 0;
+                       }
+               }
+       }
+ }
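
Range expansion deliberately wraps across subchannel sets: once from exceeds __MAX_SUBCHANNEL the devno resets to 0 and from_ssid is bumped, so a range like 0.0.ff00-0.1.00ff marks the tail of set 0 and the head of set 1 in one pass. Note also that the parsed cssid is validated but never used when filling the bitmap, which is indexed by ssid and devno only. Typical usage of the parameter declared above:

	modprobe virtio_ccw no_auto=0.0.1000,0.0.2000-0.0.20ff
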
+ static int __init virtio_ccw_init(void)
+ {
+       /* parse no_auto string before we do anything further */
+       no_auto_parse();
+       return ccw_driver_register(&virtio_ccw_driver);
+ }
+ module_init(virtio_ccw_init);
+ static void __exit virtio_ccw_exit(void)
+ {
+       ccw_driver_unregister(&virtio_ccw_driver);
+ }
+ module_exit(virtio_ccw_exit);
diff --combined include/linux/kvm_host.h
index b7996a768eb2c656417fdb082f62871a89fc6f1d,722cae78bbc47d932cd41f2989ee4b7b22d85ba8..cad77fe09d770e47a03f8a802bd09fd30b3d136b
@@@ -22,7 -22,6 +22,7 @@@
  #include <linux/rcupdate.h>
  #include <linux/ratelimit.h>
  #include <linux/err.h>
 +#include <linux/irqflags.h>
  #include <asm/signal.h>
  
  #include <linux/kvm.h>
@@@ -123,6 -122,8 +123,8 @@@ static inline bool is_error_page(struc
  #define KVM_REQ_WATCHDOG          18
  #define KVM_REQ_MASTERCLOCK_UPDATE 19
  #define KVM_REQ_MCLOCK_INPROGRESS 20
+ #define KVM_REQ_EPR_EXIT          21
+ #define KVM_REQ_EOIBITMAP         22
  
  #define KVM_USERSPACE_IRQ_SOURCE_ID           0
  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID      1
@@@ -267,12 -268,11 +269,11 @@@ static inline int kvm_vcpu_exiting_gues
  struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
-       unsigned long flags;
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch;
        unsigned long userspace_addr;
-       int user_alloc;
-       int id;
+       u32 flags;
+       short id;
  };
  
  static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@@ -314,8 -314,12 +315,12 @@@ struct kvm_irq_routing_table {}
  
  #endif
  
+ #ifndef KVM_PRIVATE_MEM_SLOTS
+ #define KVM_PRIVATE_MEM_SLOTS 0
+ #endif
  #ifndef KVM_MEM_SLOTS_NUM
- #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
  #endif
  
  /*
@@@ -327,7 -331,7 +332,7 @@@ struct kvm_memslots 
        u64 generation;
        struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
        /* The mapping table from slot id to the index in memslots[]. */
-       int id_to_index[KVM_MEM_SLOTS_NUM];
+       short id_to_index[KVM_MEM_SLOTS_NUM];
  };
  
  struct kvm {
@@@ -425,7 -429,8 +430,8 @@@ void kvm_exit(void)
  
  void kvm_get_kvm(struct kvm *kvm);
  void kvm_put_kvm(struct kvm *kvm);
- void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new);
+ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
+                    u64 last_generation);
  
  static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
  {
@@@ -448,10 -453,10 +454,10 @@@ id_to_memslot(struct kvm_memslots *slot
  
  int kvm_set_memory_region(struct kvm *kvm,
                          struct kvm_userspace_memory_region *mem,
-                         int user_alloc);
+                         bool user_alloc);
  int __kvm_set_memory_region(struct kvm *kvm,
                            struct kvm_userspace_memory_region *mem,
-                           int user_alloc);
+                           bool user_alloc);
  void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont);
  int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
@@@ -459,11 -464,11 +465,11 @@@ int kvm_arch_prepare_memory_region(stru
                                struct kvm_memory_slot *memslot,
                                struct kvm_memory_slot old,
                                struct kvm_userspace_memory_region *mem,
-                               int user_alloc);
+                               bool user_alloc);
  void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
                                struct kvm_memory_slot old,
-                               int user_alloc);
+                               bool user_alloc);
  bool kvm_largepages_enabled(void);
  void kvm_disable_largepages(void);
  /* flush all memory translations */
@@@ -533,6 -538,7 +539,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
  void kvm_flush_remote_tlbs(struct kvm *kvm);
  void kvm_reload_remote_mmus(struct kvm *kvm);
  void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+ void kvm_make_update_eoibitmap_request(struct kvm *kvm);
  
  long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
@@@ -550,7 -556,7 +557,7 @@@ int kvm_vm_ioctl_get_dirty_log(struct k
  int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                   struct
                                   kvm_userspace_memory_region *mem,
-                                  int user_alloc);
+                                  bool user_alloc);
  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
  long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg);
@@@ -686,6 -692,7 +693,7 @@@ int kvm_set_irq(struct kvm *kvm, int ir
  int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
  int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
                int irq_source_id, int level);
+ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
  void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
@@@ -741,52 -748,15 +749,52 @@@ static inline int kvm_deassign_device(s
  }
  #endif /* CONFIG_IOMMU_API */
  
 -static inline void kvm_guest_enter(void)
 +static inline void __guest_enter(void)
  {
 -      BUG_ON(preemptible());
        /*
         * This is running in ioctl context so we can avoid
         * the call to vtime_account() with its unnecessary idle check.
         */
 -      vtime_account_system_irqsafe(current);
 +      vtime_account_system(current);
        current->flags |= PF_VCPU;
 +}
 +
 +static inline void __guest_exit(void)
 +{
 +      /*
 +       * This is running in ioctl context so we can avoid
 +       * the call to vtime_account() with its unnecessary idle check.
 +       */
 +      vtime_account_system(current);
 +      current->flags &= ~PF_VCPU;
 +}
 +
 +#ifdef CONFIG_CONTEXT_TRACKING
 +extern void guest_enter(void);
 +extern void guest_exit(void);
 +
 +#else /* !CONFIG_CONTEXT_TRACKING */
 +static inline void guest_enter(void)
 +{
 +      __guest_enter();
 +}
 +
 +static inline void guest_exit(void)
 +{
 +      __guest_exit();
 +}
 +#endif /* !CONFIG_CONTEXT_TRACKING */
 +
 +static inline void kvm_guest_enter(void)
 +{
 +      unsigned long flags;
 +
 +      BUG_ON(preemptible());
 +
 +      local_irq_save(flags);
 +      guest_enter();
 +      local_irq_restore(flags);
 +
        /* KVM does not hold any references to rcu protected data when it
         * switches CPU into a guest mode. In fact switching to a guest mode
        * is very similar to exiting to userspace from rcu point of view. In
  
  static inline void kvm_guest_exit(void)
  {
 -      /*
 -       * This is running in ioctl context so we can avoid
 -       * the call to vtime_account() with its unnecessary idle check.
 -       */
 -      vtime_account_system_irqsafe(current);
 -      current->flags &= ~PF_VCPU;
 +      unsigned long flags;
 +
 +      local_irq_save(flags);
 +      guest_exit();
 +      local_irq_restore(flags);
  }
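
The churn here tracks a vtime API change: vtime_account_system_irqsafe() is gone, so the irq bracket moves into kvm_guest_enter()/kvm_guest_exit() themselves, and the enter/exit bodies are split out as __guest_enter()/__guest_exit() so that CONFIG_CONTEXT_TRACKING kernels can substitute their own guest_enter()/guest_exit(). The removed helper presumably amounted to this (sketch of the old shape, for orientation only):

/* What the dropped *_irqsafe variant boiled down to (assumed). */
static void vtime_account_system_irqsafe(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	vtime_account_system(tsk);
	local_irq_restore(flags);
}

Either way the accounting hook runs with interrupts disabled; only the place where the flags are saved changes.
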
  
  /*
diff --combined include/uapi/linux/kvm.h
index c70577cf67bc23b03fdb48c1caf5911e64e787bd,9a2db5767ed5913a69fe2383e166842a7d619e6c..3c56ba3d80c16007f9eda468f96bf337f998c1a9
@@@ -115,7 -115,6 +115,7 @@@ struct kvm_irq_level 
         * ACPI gsi notion of irq.
         * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
         * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
 +       * For ARM: See Documentation/virtual/kvm/api.txt
         */
        union {
                __u32 irq;
@@@ -169,6 -168,8 +169,8 @@@ struct kvm_pit_config 
  #define KVM_EXIT_PAPR_HCALL     19
  #define KVM_EXIT_S390_UCONTROL          20
  #define KVM_EXIT_WATCHDOG         21
+ #define KVM_EXIT_S390_TSCH        22
+ #define KVM_EXIT_EPR              23
  
  /* For KVM_EXIT_INTERNAL_ERROR */
  /* Emulate instruction failed. */
@@@ -286,6 -287,19 +288,19 @@@ struct kvm_run 
                        __u64 ret;
                        __u64 args[9];
                } papr_hcall;
+               /* KVM_EXIT_S390_TSCH */
+               struct {
+                       __u16 subchannel_id;
+                       __u16 subchannel_nr;
+                       __u32 io_int_parm;
+                       __u32 io_int_word;
+                       __u32 ipb;
+                       __u8 dequeued;
+               } s390_tsch;
+               /* KVM_EXIT_EPR */
+               struct {
+                       __u32 epr;
+               } epr;
                /* Fix the size of the union. */
                char padding[256];
        };
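
The two new exit reasons surface here in the kvm_run union: KVM_EXIT_S390_TSCH hands a TSCH intercept to a VMM that emulates channel I/O itself (see KVM_CAP_S390_CSS_SUPPORT below), and KVM_EXIT_EPR asks userspace to supply the PPC external-proxy register value. A hedged sketch of the userspace dispatch; handle_exit() and its policy are hypothetical:

#include <linux/kvm.h>
#include <stdio.h>

static int handle_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_S390_TSCH:
		/* Which subchannel, and whether KVM already dequeued a
		 * matching I/O interrupt for us to fold into the IRB. */
		printf("tsch on sch %x.%x, dequeued=%u\n",
		       run->s390_tsch.subchannel_id,
		       run->s390_tsch.subchannel_nr,
		       run->s390_tsch.dequeued);
		return 0;
	case KVM_EXIT_EPR:
		run->epr.epr = 0;	/* VMM fills in the EPR value */
		return 0;
	default:
		return -1;
	}
}
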
@@@ -398,10 -412,20 +413,20 @@@ struct kvm_s390_psw 
  #define KVM_S390_PROGRAM_INT          0xfffe0001u
  #define KVM_S390_SIGP_SET_PREFIX      0xfffe0002u
  #define KVM_S390_RESTART              0xfffe0003u
+ #define KVM_S390_MCHK                 0xfffe1000u
  #define KVM_S390_INT_VIRTIO           0xffff2603u
  #define KVM_S390_INT_SERVICE          0xffff2401u
  #define KVM_S390_INT_EMERGENCY                0xffff1201u
  #define KVM_S390_INT_EXTERNAL_CALL    0xffff1202u
+ /* Anything below 0xfffe0000u is taken by INT_IO */
+ #define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
+       (((schid)) |                           \
+        ((ssid) << 16) |                      \
+        ((cssid) << 18) |                     \
+        ((ai) << 26))
+ #define KVM_S390_INT_IO_MIN           0x00000000u
+ #define KVM_S390_INT_IO_MAX           0xfffdffffu
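
KVM_S390_INT_IO packs an I/O interrupt identifier into the low 27 bits of the type: subchannel id in bits 0-15, ssid in bits 16-17, cssid in bits 18-25, and the adapter-interrupt flag in bit 26, which is why the whole range below 0xfffe0000 is reserved for it. For example (userspace, with this header installed):

#include <linux/kvm.h>
#include <stdio.h>

int main(void)
{
	/* Adapter interrupt (ai=1) for css 0, ssid 0, subchannel 0x0001. */
	printf("%#x\n", KVM_S390_INT_IO(1, 0, 0, 0x0001));	/* 0x4000001 */
	return 0;
}
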
  
  struct kvm_s390_interrupt {
        __u32 type;
@@@ -636,8 -660,8 +661,10 @@@ struct kvm_ppc_smmu_info 
  #define KVM_CAP_IRQFD_RESAMPLE 82
  #define KVM_CAP_PPC_BOOKE_WATCHDOG 83
  #define KVM_CAP_PPC_HTAB_FD 84
+ #define KVM_CAP_S390_CSS_SUPPORT 85
+ #define KVM_CAP_PPC_EPR 86
 +#define KVM_CAP_ARM_PSCI 87
 +#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
@@@ -767,11 -791,6 +794,11 @@@ struct kvm_dirty_tlb 
  #define KVM_REG_SIZE_U512     0x0060000000000000ULL
  #define KVM_REG_SIZE_U1024    0x0070000000000000ULL
  
 +struct kvm_reg_list {
 +      __u64 n; /* number of regs */
 +      __u64 reg[0];
 +};
 +
  struct kvm_one_reg {
        __u64 id;
        __u64 addr;
@@@ -785,11 -804,6 +812,11 @@@ struct kvm_msi 
        __u8  pad[16];
  };
  
 +struct kvm_arm_device_addr {
 +      __u64 id;
 +      __u64 addr;
 +};
 +
  /*
   * ioctls for VM fds
   */
@@@ -875,8 -889,6 +902,8 @@@ struct kvm_s390_ucas_mapping 
  #define KVM_ALLOCATE_RMA        _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
  /* Available with KVM_CAP_PPC_HTAB_FD */
  #define KVM_PPC_GET_HTAB_FD     _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
 +/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
 +#define KVM_ARM_SET_DEVICE_ADDR         _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
  
  /*
   * ioctls for vcpu fds
  #define KVM_SET_ONE_REG                 _IOW(KVMIO,  0xac, struct kvm_one_reg)
  /* VM is being stopped by host */
  #define KVM_KVMCLOCK_CTRL       _IO(KVMIO,   0xad)
 +#define KVM_ARM_VCPU_INIT       _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
 +#define KVM_GET_REG_LIST        _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
  
  #define KVM_DEV_ASSIGN_ENABLE_IOMMU   (1 << 0)
  #define KVM_DEV_ASSIGN_PCI_2_3                (1 << 1)
diff --combined kernel/sched/core.c
index 053dfd7692d1ffafefa38698d9cc0e1277955013,01edad9b5d71f3603a0b1602e01272189e20301a..f1bdecf09afb593560f01309b791b5dcb1ed45d6
@@@ -83,7 -83,7 +83,7 @@@
  #endif
  
  #include "sched.h"
 -#include "../workqueue_sched.h"
 +#include "../workqueue_internal.h"
  #include "../smpboot.h"
  
  #define CREATE_TRACE_POINTS
@@@ -193,10 -193,23 +193,10 @@@ static void sched_feat_disable(int i) 
  static void sched_feat_enable(int i) { };
  #endif /* HAVE_JUMP_LABEL */
  
 -static ssize_t
 -sched_feat_write(struct file *filp, const char __user *ubuf,
 -              size_t cnt, loff_t *ppos)
 +static int sched_feat_set(char *cmp)
  {
 -      char buf[64];
 -      char *cmp;
 -      int neg = 0;
        int i;
 -
 -      if (cnt > 63)
 -              cnt = 63;
 -
 -      if (copy_from_user(&buf, ubuf, cnt))
 -              return -EFAULT;
 -
 -      buf[cnt] = 0;
 -      cmp = strstrip(buf);
 +      int neg = 0;
  
        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                }
        }
  
 +      return i;
 +}
 +
 +static ssize_t
 +sched_feat_write(struct file *filp, const char __user *ubuf,
 +              size_t cnt, loff_t *ppos)
 +{
 +      char buf[64];
 +      char *cmp;
 +      int i;
 +
 +      if (cnt > 63)
 +              cnt = 63;
 +
 +      if (copy_from_user(&buf, ubuf, cnt))
 +              return -EFAULT;
 +
 +      buf[cnt] = 0;
 +      cmp = strstrip(buf);
 +
 +      i = sched_feat_set(cmp);
        if (i == __SCHED_FEAT_NR)
                return -EINVAL;
  
@@@ -1132,28 -1124,18 +1132,28 @@@ EXPORT_SYMBOL_GPL(kick_process)
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
 -      const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
 +      int nid = cpu_to_node(cpu);
 +      const struct cpumask *nodemask = NULL;
        enum { cpuset, possible, fail } state = cpuset;
        int dest_cpu;
  
 -      /* Look for allowed, online CPU in same node. */
 -      for_each_cpu(dest_cpu, nodemask) {
 -              if (!cpu_online(dest_cpu))
 -                      continue;
 -              if (!cpu_active(dest_cpu))
 -                      continue;
 -              if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 -                      return dest_cpu;
 +      /*
 +       * If the node that the cpu is on has been offlined, cpu_to_node()
 +       * will return -1. There is no cpu on the node, and we should
 +       * select the cpu on the other node.
 +       * select a cpu on another node.
 +      if (nid != -1) {
 +              nodemask = cpumask_of_node(nid);
 +
 +              /* Look for allowed, online CPU in same node. */
 +              for_each_cpu(dest_cpu, nodemask) {
 +                      if (!cpu_online(dest_cpu))
 +                              continue;
 +                      if (!cpu_active(dest_cpu))
 +                              continue;
 +                      if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 +                              return dest_cpu;
 +              }
        }
  
        for (;;) {
@@@ -1533,8 -1515,7 +1533,8 @@@ out
   */
  int wake_up_process(struct task_struct *p)
  {
 -      return try_to_wake_up(p, TASK_ALL, 0);
 +      WARN_ON(task_is_stopped_or_traced(p));
 +      return try_to_wake_up(p, TASK_NORMAL, 0);
  }
  EXPORT_SYMBOL(wake_up_process);
  
@@@ -1579,40 -1560,7 +1579,40 @@@ static void __sched_fork(struct task_st
  #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
  #endif
 +
 +#ifdef CONFIG_NUMA_BALANCING
 +      if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 +              p->mm->numa_next_scan = jiffies;
 +              p->mm->numa_next_reset = jiffies;
 +              p->mm->numa_scan_seq = 0;
 +      }
 +
 +      p->node_stamp = 0ULL;
 +      p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 +      p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
 +      p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 +      p->numa_work.next = &p->numa_work;
 +#endif /* CONFIG_NUMA_BALANCING */
 +}
 +
 +#ifdef CONFIG_NUMA_BALANCING
 +#ifdef CONFIG_SCHED_DEBUG
 +void set_numabalancing_state(bool enabled)
 +{
 +      if (enabled)
 +              sched_feat_set("NUMA");
 +      else
 +              sched_feat_set("NO_NUMA");
  }
 +#else
 +__read_mostly bool numabalancing_enabled;
 +
 +void set_numabalancing_state(bool enabled)
 +{
 +      numabalancing_enabled = enabled;
 +}
 +#endif /* CONFIG_SCHED_DEBUG */
 +#endif /* CONFIG_NUMA_BALANCING */
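
Splitting sched_feat_set() out of the debugfs write path (earlier hunk) is what makes this block possible: set_numabalancing_state() can now toggle the NUMA scheduler feature by name, while !SCHED_DEBUG builds, whose feature bits are compile-time constants, fall back to a plain global boolean. The SCHED_DEBUG leg in isolation:

/* Sketch: both knobs funnel into the same parser; per the write path
 * above, sched_feat_set() returns __SCHED_FEAT_NR on an unknown name. */
static int toggle_numa_feature(bool enabled)
{
	return sched_feat_set(enabled ? "NUMA" : "NO_NUMA");
}
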
  
  /*
   * fork()/clone()-time setup:
@@@ -4108,14 -4056,8 +4108,14 @@@ long sched_setaffinity(pid_t pid, cons
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
 -      if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
 -              goto out_unlock;
 +      if (!check_same_owner(p)) {
 +              rcu_read_lock();
 +              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
 +                      rcu_read_unlock();
 +                      goto out_unlock;
 +              }
 +              rcu_read_unlock();
 +      }
  
        retval = security_task_setscheduler(p);
        if (retval)
@@@ -4374,20 -4316,32 +4374,32 @@@ EXPORT_SYMBOL(yield)
   * It's the caller's job to ensure that the target task struct
   * can't go away on us before we can do any checks.
   *
-  * Returns true if we indeed boosted the target task.
+  * Returns:
+  *    true (>0) if we indeed boosted the target task.
+  *    false (0) if we failed to boost the target.
+  *    -ESRCH if there's no task to yield to.
   */
  bool __sched yield_to(struct task_struct *p, bool preempt)
  {
        struct task_struct *curr = current;
        struct rq *rq, *p_rq;
        unsigned long flags;
 -      bool yielded = 0;
 +      int yielded = 0;
  
        local_irq_save(flags);
        rq = this_rq();
  
  again:
        p_rq = task_rq(p);
+       /*
+        * If we're the only runnable task on the rq and target rq also
+        * has only one task, there's absolutely no point in yielding.
+        */
+       if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+               yielded = -ESRCH;
+               goto out_irq;
+       }
        double_rq_lock(rq, p_rq);
        while (task_rq(p) != p_rq) {
                double_rq_unlock(rq, p_rq);
        }
  
        if (!curr->sched_class->yield_to_task)
-               goto out;
+               goto out_unlock;
  
        if (curr->sched_class != p->sched_class)
-               goto out;
+               goto out_unlock;
  
        if (task_running(p_rq, p) || p->state)
-               goto out;
+               goto out_unlock;
  
        yielded = curr->sched_class->yield_to_task(rq, p, preempt);
        if (yielded) {
                        resched_task(p_rq->curr);
        }
  
- out:
+ out_unlock:
        double_rq_unlock(rq, p_rq);
+ out_irq:
        local_irq_restore(flags);
  
-       if (yielded)
+       if (yielded > 0)
                schedule();
  
        return yielded;
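
yield_to() now reports three outcomes instead of two: >0 boosted, 0 failed, and -ESRCH when both runqueues hold a single task so yielding cannot help. (The declaration visible here still says bool, so the negative value only survives to callers once the prototype is widened to int; that change is not part of this combined view.) A scanner-style caller, such as the KVM spin-loop heuristic, can then stop early, roughly:

#include <errno.h>

struct task_struct;
extern int yield_to_target(struct task_struct *p);	/* hypothetical */

/* Sketch: consuming the three-way result while scanning candidates. */
static int boost_one(struct task_struct **cand, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		int yielded = yield_to_target(cand[i]);

		if (yielded > 0)
			return 1;	/* boosted: let the target run */
		if (yielded == -ESRCH)
			return 0;	/* yielding is pointless, stop */
		/* 0: this candidate could not be boosted, try the next */
	}
	return 0;
}
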
@@@ -4677,7 -4632,6 +4690,7 @@@ void __cpuinit init_idle(struct task_st
         */
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
 +      vtime_init_idle(idle);
  #if defined(CONFIG_SMP)
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
  #endif
@@@ -7171,6 -7125,7 +7184,6 @@@ static void free_sched_group(struct tas
  struct task_group *sched_create_group(struct task_group *parent)
  {
        struct task_group *tg;
 -      unsigned long flags;
  
        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
        if (!tg)
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
  
 +      return tg;
 +
 +err:
 +      free_sched_group(tg);
 +      return ERR_PTR(-ENOMEM);
 +}
 +
 +void sched_online_group(struct task_group *tg, struct task_group *parent)
 +{
 +      unsigned long flags;
 +
        spin_lock_irqsave(&task_group_lock, flags);
        list_add_rcu(&tg->list, &task_groups);
  
        INIT_LIST_HEAD(&tg->children);
        list_add_rcu(&tg->siblings, &parent->children);
        spin_unlock_irqrestore(&task_group_lock, flags);
 -
 -      return tg;
 -
 -err:
 -      free_sched_group(tg);
 -      return ERR_PTR(-ENOMEM);
  }
  
  /* rcu callback to free various structures associated with a task group */
@@@ -7213,12 -7163,6 +7226,12 @@@ static void free_sched_group_rcu(struc
  
  /* Destroy runqueue etc associated with a task group */
  void sched_destroy_group(struct task_group *tg)
 +{
 +      /* wait for possible concurrent references to cfs_rqs to complete */
 +      call_rcu(&tg->rcu, free_sched_group_rcu);
 +}
 +
 +void sched_offline_group(struct task_group *tg)
  {
        unsigned long flags;
        int i;
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
 -
 -      /* wait for possible concurrent references to cfs_rqs complete */
 -      call_rcu(&tg->rcu, free_sched_group_rcu);
  }
  
  /* change task's runqueue when it moves between groups.
@@@ -7526,25 -7473,6 +7539,25 @@@ static int sched_rt_global_constraints(
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
 +int sched_rr_handler(struct ctl_table *table, int write,
 +              void __user *buffer, size_t *lenp,
 +              loff_t *ppos)
 +{
 +      int ret;
 +      static DEFINE_MUTEX(mutex);
 +
 +      mutex_lock(&mutex);
 +      ret = proc_dointvec(table, write, buffer, lenp, ppos);
 +      /* make sure that internally we keep jiffies */
 +      /* also, writing zero resets timeslice to default */
 +      if (!ret && write) {
 +              sched_rr_timeslice = sched_rr_timeslice <= 0 ?
 +                      RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
 +      }
 +      mutex_unlock(&mutex);
 +      return ret;
 +}
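
sched_rr_handler() is the sysctl write path for the new SCHED_RR timeslice knob: userspace writes milliseconds, the handler normalizes the value to jiffies for internal use, and zero or a negative value resets it to the RR_TIMESLICE default; the static mutex simply serializes concurrent writers. The matching table entry would look roughly like this (sketch; the real entry lives in kernel/sysctl.c and the procname is an assumption):

/* Assumed ctl_table hookup for the handler above. */
static struct ctl_table sched_rr_table[] = {
	{
		.procname	= "sched_rr_timeslice_ms",
		.data		= &sched_rr_timeslice,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= sched_rr_handler,
	},
	{ }
};
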
 +
  int sched_rt_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
@@@ -7601,19 -7529,6 +7614,19 @@@ static struct cgroup_subsys_state *cpu_
        return &tg->css;
  }
  
 +static int cpu_cgroup_css_online(struct cgroup *cgrp)
 +{
 +      struct task_group *tg = cgroup_tg(cgrp);
 +      struct task_group *parent;
 +
 +      if (!cgrp->parent)
 +              return 0;
 +
 +      parent = cgroup_tg(cgrp->parent);
 +      sched_online_group(tg, parent);
 +      return 0;
 +}
 +
  static void cpu_cgroup_css_free(struct cgroup *cgrp)
  {
        struct task_group *tg = cgroup_tg(cgrp);
        sched_destroy_group(tg);
  }
  
 +static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 +{
 +      struct task_group *tg = cgroup_tg(cgrp);
 +
 +      sched_offline_group(tg);
 +}
 +
  static int cpu_cgroup_can_attach(struct cgroup *cgrp,
                                 struct cgroup_taskset *tset)
  {
@@@ -7983,8 -7891,6 +7996,8 @@@ struct cgroup_subsys cpu_cgroup_subsys 
        .name           = "cpu",
        .css_alloc      = cpu_cgroup_css_alloc,
        .css_free       = cpu_cgroup_css_free,
 +      .css_online     = cpu_cgroup_css_online,
 +      .css_offline    = cpu_cgroup_css_offline,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,