Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
diff --combined Documentation/virtual/kvm/api.txt

index e0fa0ea2b1870f17291acc9890c47150f1e092f5,c2534c300a45489bc3d3cd2ecc8b984553f91fe4..119358dfb74295af7e95c6ecdcda35e9cf1d8f17
--- 1/Documentation/virtual/kvm/api.txt
--- 2/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@@ -219,19 -219,6 +219,6 @@@ allocation of vcpu ids.  For example, i
   single-threaded guest vcpus, it should make all vcpu ids be a multiple
   of the number of vcpus per vcore.
   
- On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
- threads in one or more virtual CPU cores.  (This is because the
- hardware requires all the hardware threads in a CPU core to be in the
- same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
- of vcpus per virtual core (vcore).  The vcore id is obtained by
- dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
- given vcore will always be in the same physical core as each other
- (though that might be a different physical core from time to time).
- Userspace can control the threading (SMT) mode of the guest by its
- allocation of vcpu ids.  For example, if userspace wants
- single-threaded guest vcpus, it should make all vcpu ids be a multiple
- of the number of vcpus per vcore.
- 
   For virtual cpus that have been created with S390 user controlled virtual
   machines, the resulting vcpu fd can be memory mapped at page offset
   KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
@@@ -293,7 -280,7 +280,7 @@@ kvm_run' (see below)
   4.11 KVM_GET_REGS
   
   Capability: basic
- -Architectures: all
+ +Architectures: all except ARM
   Type: vcpu ioctl
   Parameters: struct kvm_regs (out)
   Returns: 0 on success, -1 on error
@@@ -314,7 -301,7 +301,7 @@@ struct kvm_regs 
   4.12 KVM_SET_REGS
   
   Capability: basic
- -Architectures: all
+ +Architectures: all except ARM
   Type: vcpu ioctl
   Parameters: struct kvm_regs (in)
   Returns: 0 on success, -1 on error
@@@ -345,7 -332,7 +332,7 @@@ struct kvm_sregs 
         __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
   };
   
- /* ppc -- see arch/powerpc/include/asm/kvm.h */
+ /* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */
   
   interrupt_bitmap is a bitmap of pending external interrupts.  At most
   one bit may be set.  This interrupt has been acknowledged by the APIC
@@@ -600,7 -587,7 +587,7 @@@ struct kvm_fpu 
   4.24 KVM_CREATE_IRQCHIP
   
   Capability: KVM_CAP_IRQCHIP
- -Architectures: x86, ia64
+ +Architectures: x86, ia64, ARM
   Type: vm ioctl
   Parameters: none
   Returns: 0 on success, -1 on error
@@@ -608,39 -595,21 +595,39 @@@
   Creates an interrupt controller model in the kernel.  On x86, creates a virtual
   ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
   local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
- -only go to the IOAPIC.  On ia64, a IOSAPIC is created.
+ +only go to the IOAPIC.  On ia64, a IOSAPIC is created. On ARM, a GIC is
+ +created.
   
   
   4.25 KVM_IRQ_LINE
   
   Capability: KVM_CAP_IRQCHIP
- -Architectures: x86, ia64
+ +Architectures: x86, ia64, arm
   Type: vm ioctl
   Parameters: struct kvm_irq_level
   Returns: 0 on success, -1 on error
   
   Sets the level of a GSI input to the interrupt controller model in the kernel.
- -Requires that an interrupt controller model has been previously created with
- -KVM_CREATE_IRQCHIP.  Note that edge-triggered interrupts require the level
- -to be set to 1 and then back to 0.
+ +On some architectures it is required that an interrupt controller model has
+ +been previously created with KVM_CREATE_IRQCHIP.  Note that edge-triggered
+ +interrupts require the level to be set to 1 and then back to 0.
+ +
+ +ARM can signal an interrupt either at the CPU level, or at the in-kernel irqchip
+ +(GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for
+ +specific cpus.  The irq field is interpreted like this:
+ +
+ +  bits:  | 31 ... 24 | 23  ... 16 | 15    ...    0 |
+ +  field: | irq_type  | vcpu_index |     irq_id     |
+ +
+ +The irq_type field has the following values:
+ +- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
+ +- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.)
+ +               (the vcpu_index field is ignored)
+ +- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.)
+ +
+ +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
+ +
+ +In both cases, level is used to raise/lower the line.
   
   struct kvm_irq_level {
         union {
@@@ -892,12 -861,12 +879,12 @@@ It is recommended that the lower 21 bit
   be identical.  This allows large pages in the guest to be backed by large
   pages in the host.
   
- The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs
- kvm to keep track of writes to memory within the slot.  See KVM_GET_DIRTY_LOG
- ioctl.  The KVM_CAP_READONLY_MEM capability indicates the availability of the
- KVM_MEM_READONLY flag.  When this flag is set for a memory region, KVM only
- allows read accesses.  Writes will be posted to userspace as KVM_EXIT_MMIO
- exits.
+ The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
+ KVM_MEM_READONLY.  The former can be set to instruct KVM to keep track of
+ writes to memory within the slot.  See KVM_GET_DIRTY_LOG ioctl to know how to
+ use it.  The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
+ to make a new slot read-only.  In this case, writes to this memory will be
+ posted to userspace as KVM_EXIT_MMIO exits.
   
   When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
   the memory region are automatically reflected into the guest.  For example, an
@@@ -931,7 -900,7 +918,7 @@@ documentation when it pops into existen
   4.37 KVM_ENABLE_CAP
   
   Capability: KVM_CAP_ENABLE_CAP
- Architectures: ppc
+ Architectures: ppc, s390
   Type: vcpu ioctl
   Parameters: struct kvm_enable_cap (in)
   Returns: 0 on success; -1 on error
@@@ -1792,28 -1761,8 +1779,29 @@@ registers, find a list below
     PPC   | KVM_REG_PPC_VPA_SLB   | 128
     PPC   | KVM_REG_PPC_VPA_DTL   | 128
     PPC   | KVM_REG_PPC_EPCR    | 32
+   PPC   | KVM_REG_PPC_EPR     | 32
   
+ +ARM registers are mapped using the lower 32 bits.  The upper 16 of that
+ +is the register group type, or coprocessor number:
+ +
+ +ARM core registers have the following id bit patterns:
+ +  0x4002 0000 0010 <index into the kvm_regs struct:16>
+ +
+ +ARM 32-bit CP15 registers have the following id bit patterns:
+ +  0x4002 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3>
+ +
+ +ARM 64-bit CP15 registers have the following id bit patterns:
+ +  0x4003 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3>
+ +
+ +ARM CCSIDR registers are demultiplexed by CSSELR value:
+ +  0x4002 0000 0011 00 <csselr:8>
+ +
+ +ARM 32-bit VFP control registers have the following id bit patterns:
+ +  0x4002 0000 0012 1 <regno:12>
+ +
+ +ARM 64-bit FP registers have the following id bit patterns:
+ +  0x4002 0000 0012 0 <regno:12>
+ +
   4.69 KVM_GET_ONE_REG
   
   Capability: KVM_CAP_ONE_REG
@@@ -2108,6 -2057,14 +2096,14 @@@ KVM_S390_INT_VIRTIO (vm) - virtio exter
   KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
   KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
   KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
+ KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an
+     I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel);
+     I/O interruption parameters in parm (subchannel) and parm64 (intparm,
+     interruption subclass)
+ KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm,
+                            machine check interrupt code in parm64 (note that
+                            machine checks needing further payload are not
+                            supported by this ioctl)
   
   Note that the vcpu ioctl is asynchronous to vcpu execution.
   
@@@ -2166,88 -2123,6 +2162,88 @@@ written, then `n_invalid' invalid entri
   valid entries found.
   
   
+ +4.77 KVM_ARM_VCPU_INIT
+ +
+ +Capability: basic
+ +Architectures: arm
+ +Type: vcpu ioctl
+ +Parameters: struct kvm_vcpu_init (in)
+ +Returns: 0 on success; -1 on error
+ +Errors:
+ +  EINVAL:    the target is unknown, or the combination of features is invalid.
+ +  ENOENT:    a features bit specified is unknown.
+ +
+ +This tells KVM what type of CPU to present to the guest, and what
+ +optional features it should have.  This will cause a reset of the cpu
+ +registers to their initial values.  If this is not called, KVM_RUN will
+ +return ENOEXEC for that vcpu.
+ +
+ +Note that because some registers reflect machine topology, all vcpus
+ +should be created before this ioctl is invoked.
+ +
+ +Possible features:
+ +      - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
+ +        Depends on KVM_CAP_ARM_PSCI.
+ +
+ +
+ +4.78 KVM_GET_REG_LIST
+ +
+ +Capability: basic
+ +Architectures: arm
+ +Type: vcpu ioctl
+ +Parameters: struct kvm_reg_list (in/out)
+ +Returns: 0 on success; -1 on error
+ +Errors:
+ +  E2BIG:     the reg index list is too big to fit in the array specified by
+ +             the user (the number required will be written into n).
+ +
+ +struct kvm_reg_list {
+ +      __u64 n; /* number of registers in reg[] */
+ +      __u64 reg[0];
+ +};
+ +
+ +This ioctl returns the guest registers that are supported for the
+ +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
+ +
+ +
+ +4.80 KVM_ARM_SET_DEVICE_ADDR
+ +
+ +Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
+ +Architectures: arm
+ +Type: vm ioctl
+ +Parameters: struct kvm_arm_device_addr (in)
+ +Returns: 0 on success, -1 on error
+ +Errors:
+ +  ENODEV: The device id is unknown
+ +  ENXIO:  Device not supported on current system
+ +  EEXIST: Address already set
+ +  E2BIG:  Address outside guest physical address space
+ +  EBUSY:  Address overlaps with other device range
+ +
+ +struct kvm_arm_device_addr {
+ +      __u64 id;
+ +      __u64 addr;
+ +};
+ +
+ +Specify a device address in the guest's physical address space where guests
+ +can access emulated or directly exposed devices, which the host kernel needs
+ +to know about. The id field is an architecture specific identifier for a
+ +specific device.
+ +
+ +ARM divides the id field into two parts, a device id and an address type id
+ +specific to the individual device.
+ +
+ +  bits:  | 63        ...       32 | 31    ...    16 | 15    ...    0 |
+ +  field: |        0x00000000      |     device id   |  addr type id  |
+ +
+ +ARM currently only requires this when using the in-kernel GIC support for the
+ +hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 as the device id.  When
+ +setting the base address for the guest's mapping of the VGIC virtual CPU
+ +and distributor interface, the ioctl must be called after calling
+ +KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs.  Calling
+ +this ioctl twice for any of the base addresses will return -EEXIST.
+ +
+ +
   5. The kvm_run structure
   ------------------------
   
@@@ -2359,8 -2234,8 +2355,8 @@@ executed a memory-mapped I/O instructio
   by kvm.  The 'data' member contains the written data if 'is_write' is
   true, and should be filled by application code otherwise.
   
- NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
-       and KVM_EXIT_PAPR the corresponding
+ NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
+       KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
   operations are complete (and guest state is consistent) only after userspace
   has re-entered the kernel with KVM_RUN.  The kernel side will first finish
   incomplete operations and then check for pending signals.  Userspace
@@@ -2463,6 -2338,41 +2459,41 @@@ The possible hypercalls are defined in 
   Requirements (PAPR) document available from www.power.org (free
   developer registration required to access it).
   
+               /* KVM_EXIT_S390_TSCH */
+               struct {
+                       __u16 subchannel_id;
+                       __u16 subchannel_nr;
+                       __u32 io_int_parm;
+                       __u32 io_int_word;
+                       __u32 ipb;
+                       __u8 dequeued;
+               } s390_tsch;
+ 
+ s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled
+ and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O
+ interrupt for the target subchannel has been dequeued and subchannel_id,
+ subchannel_nr, io_int_parm and io_int_word contain the parameters for that
+ interrupt. ipb is needed for instruction parameter decoding.
+ 
+               /* KVM_EXIT_EPR */
+               struct {
+                       __u32 epr;
+               } epr;
+ 
+ On FSL BookE PowerPC chips, the interrupt controller has a fast
+ interrupt acknowledge path to the core. When the core successfully
+ delivers an interrupt, it automatically populates the EPR register with
+ the interrupt vector number and acknowledges the interrupt inside
+ the interrupt controller.
+ 
+ In case the interrupt controller lives in user space, we need to do
+ the interrupt acknowledge cycle through it to fetch the next to be
+ delivered interrupt vector using this exit.
+ 
+ It gets triggered whenever KVM_CAP_PPC_EPR is enabled and an
+ external interrupt has just been delivered into the guest. User space
+ should put the acknowledged interrupt vector into the 'epr' field.
+ 
                 /* Fix the size of the union. */
                 char padding[256];
         };
@@@ -2584,3 -2494,34 +2615,34 @@@ For mmu types KVM_MMU_FSL_BOOKE_NOHV an
      where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value.
    - The tsize field of mas1 shall be set to 4K on TLB0, even though the
      hardware ignores this value for TLB0.
+ 
+ 6.4 KVM_CAP_S390_CSS_SUPPORT
+ 
+ Architectures: s390
+ Parameters: none
+ Returns: 0 on success; -1 on error
+ 
+ This capability enables support for handling of channel I/O instructions.
+ 
+ TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are
+ handled in-kernel, while the other I/O instructions are passed to userspace.
+ 
+ When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST
+ SUBCHANNEL intercepts.
+ 
+ 6.5 KVM_CAP_PPC_EPR
+ 
+ Architectures: ppc
+ Parameters: args[0] defines whether the proxy facility is active
+ Returns: 0 on success; -1 on error
+ 
+ This capability enables or disables the delivery of interrupts through the
+ external proxy facility.
+ 
+ When enabled (args[0] != 0), every time the guest gets an external interrupt
+ delivered, it automatically exits into user space with a KVM_EXIT_EPR exit
+ to receive the topmost interrupt vector.
+ 
+ When disabled (args[0] == 0), behavior is as if this facility is unsupported.
+ 
+ When this capability is enabled, KVM_EXIT_EPR can occur.
diff --combined arch/powerpc/include/asm/kvm_host.h

index 03d7beae89a0ebb7fd0e66187db5c643d51b00f3,8a72d59467ebedd47356301813b79c887c5de6ee..d1bb86074721cf5394da11aca171fee1981bd72b
--- 1/arch/powerpc/include/asm/kvm_host.h
--- 2/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@@ -37,10 -37,8 +37,8 @@@
   
   #define KVM_MAX_VCPUS         NR_CPUS
   #define KVM_MAX_VCORES                NR_CPUS
- #define KVM_MEMORY_SLOTS 32
- /* memory slots that does not exposed to userspace */
- #define KVM_PRIVATE_MEM_SLOTS 4
- #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ #define KVM_USER_MEM_SLOTS 32
+ #define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
   
   #ifdef CONFIG_KVM_MMIO
   #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@@ -440,7 -438,6 +438,7 @@@ struct kvm_vcpu_arch 
         ulong uamor;
         u32 ctrl;
         ulong dabr;
+ +      ulong cfar;
   #endif
         u32 vrsave; /* also USPRG0 */
         u32 mmucr;
@@@ -523,6 -520,8 +521,8 @@@
         u8 sane;
         u8 cpu_type;
         u8 hcall_needed;
+       u8 epr_enabled;
+       u8 epr_needed;
   
         u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
   
diff --combined arch/powerpc/include/asm/reg.h

index 7035e608f3fa153fe248606a320af14fe0bed6dd,11ae3d8ba3a2a95fb4672a6e5a18b04b145d4539..e6658612203010aec0d3ba6ebb1f9c646e639575
--- 1/arch/powerpc/include/asm/reg.h
--- 2/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@@ -29,10 -29,6 +29,10 @@@
   #define MSR_SF_LG     63              /* Enable 64 bit mode */
   #define MSR_ISF_LG    61              /* Interrupt 64b mode valid on 630 */
   #define MSR_HV_LG     60              /* Hypervisor state */
+ +#define MSR_TS_T_LG   34              /* Trans Mem state: Transactional */
+ +#define MSR_TS_S_LG   33              /* Trans Mem state: Suspended */
+ +#define MSR_TS_LG     33              /* Trans Mem state (2 bits) */
+ +#define MSR_TM_LG     32              /* Trans Mem Available */
   #define MSR_VEC_LG    25              /* Enable AltiVec */
   #define MSR_VSX_LG    23              /* Enable VSX */
   #define MSR_POW_LG    18              /* Enable Power Management */
@@@ -102,26 -98,6 +102,26 @@@
   #define MSR_RI                __MASK(MSR_RI_LG)       /* Recoverable Exception */
   #define MSR_LE                __MASK(MSR_LE_LG)       /* Little Endian */
   
+ +#define MSR_TM                __MASK(MSR_TM_LG)       /* Transactional Mem Available */
+ +#define MSR_TS_N      0                       /*  Non-transactional */
+ +#define MSR_TS_S      __MASK(MSR_TS_S_LG)     /*  Transaction Suspended */
+ +#define MSR_TS_T      __MASK(MSR_TS_T_LG)     /*  Transaction Transactional */
+ +#define MSR_TS_MASK   (MSR_TS_T | MSR_TS_S)   /* Transaction State bits */
+ +#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */
+ +#define MSR_TM_TRANSACTIONAL(x)       (((x) & MSR_TS_MASK) == MSR_TS_T)
+ +#define MSR_TM_SUSPENDED(x)   (((x) & MSR_TS_MASK) == MSR_TS_S)
+ +
+ +/* Reason codes describing kernel causes for transaction aborts.  By
+ +   convention, bit0 is copied to TEXASR[56] (IBM bit 7) which is set if
+ +   the failure is persistent.
+ +*/
+ +#define TM_CAUSE_RESCHED      0xfe
+ +#define TM_CAUSE_TLBI         0xfc
+ +#define TM_CAUSE_FAC_UNAV     0xfa
+ +#define TM_CAUSE_SYSCALL      0xf9 /* Persistent */
+ +#define TM_CAUSE_MISC         0xf6
+ +#define TM_CAUSE_SIGNAL               0xf4
+ +
   #if defined(CONFIG_PPC_BOOK3S_64)
   #define MSR_64BIT     MSR_SF
   
@@@ -217,10 -193,6 +217,10 @@@
   #define SPRN_UAMOR    0x9d    /* User Authority Mask Override Register */
   #define SPRN_AMOR     0x15d   /* Authority Mask Override Register */
   #define SPRN_ACOP     0x1F    /* Available Coprocessor Register */
+ +#define SPRN_TFIAR    0x81    /* Transaction Failure Inst Addr   */
+ +#define SPRN_TEXASR   0x82    /* Transaction EXception & Summary */
+ +#define SPRN_TEXASRU  0x83    /* ''      ''      ''    Upper 32  */
+ +#define SPRN_TFHAR    0x80    /* Transaction Failure Handler Addr */
   #define SPRN_CTRLF    0x088
   #define SPRN_CTRLT    0x098
   #define   CTRL_CT     0xc0000000      /* current thread */
@@@ -228,12 -200,10 +228,12 @@@
   #define   CTRL_CT1    0x40000000      /* thread 1 */
   #define   CTRL_TE     0x00c00000      /* thread enable */
   #define   CTRL_RUNLATCH       0x1
+ +#define SPRN_DAWR     0xB4
+ +#define SPRN_DAWRX    0xBC
+ +#define   DAWRX_USER  (1UL << 0)
+ +#define   DAWRX_KERNEL        (1UL << 1)
+ +#define   DAWRX_HYP   (1UL << 2)
   #define SPRN_DABR     0x3F5   /* Data Address Breakpoint Register */
- -#define   DABR_TRANSLATION    (1UL << 2)
- -#define   DABR_DATA_WRITE     (1UL << 1)
- -#define   DABR_DATA_READ      (1UL << 0)
   #define SPRN_DABR2    0x13D   /* e300 */
   #define SPRN_DABRX    0x3F7   /* Data Address Breakpoint Register Extension */
   #define   DABRX_USER  (1UL << 0)
@@@ -265,9 -235,6 +265,9 @@@
   #define SPRN_HRMOR    0x139   /* Real mode offset register */
   #define SPRN_HSRR0    0x13A   /* Hypervisor Save/Restore 0 */
   #define SPRN_HSRR1    0x13B   /* Hypervisor Save/Restore 1 */
+ +#define SPRN_FSCR     0x099   /* Facility Status & Control Register */
+ +#define FSCR_TAR      (1<<8)  /* Enable Target Address Register */
+ +#define SPRN_TAR      0x32f   /* Target Address Register */
   #define SPRN_LPCR     0x13E   /* LPAR Control Register */
   #define   LPCR_VPM0   (1ul << (63-0))
   #define   LPCR_VPM1   (1ul << (63-1))
@@@ -282,8 -249,6 +282,8 @@@
   #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
   #define         LPCR_RMLS_SH  (63-37)
   #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
+ +#define   LPCR_AIL_0  0x00000000      /* MMU off exception offset 0x0 */
+ +#define   LPCR_AIL_3  0x01800000      /* MMU on exception offset 0xc00...4xxx */
   #define   LPCR_PECE   0x00007000      /* powersave exit cause enable */
   #define     LPCR_PECE0        0x00004000      /* ext. exceptions can cause exit */
   #define     LPCR_PECE1        0x00002000      /* decrementer can cause exit */
@@@ -322,7 -287,6 +322,7 @@@
   #define SPRN_DBAT6U   0x23C   /* Data BAT 6 Upper Register */
   #define SPRN_DBAT7L   0x23F   /* Data BAT 7 Lower Register */
   #define SPRN_DBAT7U   0x23E   /* Data BAT 7 Upper Register */
+ +#define SPRN_PPR      0x380   /* SMT Thread status Register */
   
   #define SPRN_DEC      0x016           /* Decrement Register */
   #define SPRN_DER      0x095           /* Debug Enable Regsiter */
@@@ -517,7 -481,6 +517,7 @@@
   #ifndef SPRN_PIR
   #define SPRN_PIR      0x3FF   /* Processor Identification Register */
   #endif
+ +#define SPRN_TIR      0x1BE   /* Thread Identification Register */
   #define SPRN_PTEHI    0x3D5   /* 981 7450 PTE HI word (S/W TLB load) */
   #define SPRN_PTELO    0x3D6   /* 982 7450 PTE LO word (S/W TLB load) */
   #define SPRN_PURR     0x135   /* Processor Utilization of Resources Reg */
@@@ -798,7 -761,7 +798,7 @@@
    *        HV mode in which case it is HSPRG0
    *
    * 64-bit server:
- - *    - SPRG0 unused (reserved for HV on Power4)
+ + *    - SPRG0 scratch for TM recheckpoint/reclaim (reserved for HV on Power4)
    *    - SPRG2 scratch for exception vectors
    *    - SPRG3 CPU and NUMA node for VDSO getcpu (user visible)
    *      - HSPRG0 stores PACA in HV mode
@@@ -956,8 -919,6 +956,6 @@@
   #define SPRN_SPRG_RSCRATCH_DBG        SPRN_SPRG9
   #define SPRN_SPRG_WSCRATCH_DBG        SPRN_SPRG9
   #endif
- #define SPRN_SPRG_RVCPU               SPRN_SPRG1
- #define SPRN_SPRG_WVCPU               SPRN_SPRG1
   #endif
   
   #ifdef CONFIG_8xx
@@@ -1067,7 -1028,6 +1065,7 @@@
   #define PVR_970MP     0x0044
   #define PVR_970GX     0x0045
   #define PVR_POWER7p   0x004A
+ +#define PVR_POWER8    0x004B
   #define PVR_BE                0x0070
   #define PVR_PA6T      0x0090
   
diff --combined arch/powerpc/kernel/asm-offsets.c

index 781190367292514e9d1fd39fdda0ab62fea54a3b,46f6afd2172aac2b006c21b9c9a870a624bad24b..b6c17ec9b1691c80de11785b9ea850930d778298
--- 1/arch/powerpc/kernel/asm-offsets.c
--- 2/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@@ -77,7 -77,6 +77,7 @@@ int main(void
         DEFINE(NMI_MASK, NMI_MASK);
         DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr));
         DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit));
+ +      DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr));
   #else
         DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
   #endif /* CONFIG_PPC64 */
@@@ -118,38 -117,10 +118,38 @@@
   #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
         DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
   #endif
- #ifdef CONFIG_KVM_BOOKE_HV
+ #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
         DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
   #endif
   
+ +#ifdef CONFIG_PPC_BOOK3S_64
+ +      DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar));
+ +#endif
+ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ +      DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
+ +      DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
+ +      DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
+ +      DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
+ +      DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
+ +      DEFINE(THREAD_TRANSACT_VR0, offsetof(struct thread_struct,
+ +                                       transact_vr[0]));
+ +      DEFINE(THREAD_TRANSACT_VSCR, offsetof(struct thread_struct,
+ +                                        transact_vscr));
+ +      DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct,
+ +                                          transact_vrsave));
+ +      DEFINE(THREAD_TRANSACT_FPR0, offsetof(struct thread_struct,
+ +                                        transact_fpr[0]));
+ +      DEFINE(THREAD_TRANSACT_FPSCR, offsetof(struct thread_struct,
+ +                                         transact_fpscr));
+ +#ifdef CONFIG_VSX
+ +      DEFINE(THREAD_TRANSACT_VSR0, offsetof(struct thread_struct,
+ +                                        transact_fpr[0]));
+ +#endif
+ +      /* Local pt_regs on stack for Transactional Memory funcs. */
+ +      DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
+ +             sizeof(struct pt_regs) + 16);
+ +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+ +
         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
@@@ -503,7 -474,6 +503,7 @@@
         DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
         DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
         DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+ +      DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
         DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
         DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
         DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
@@@ -583,10 -553,6 +583,10 @@@
         DEFINE(IPI_PRIORITY, IPI_PRIORITY);
   #endif /* CONFIG_KVM_BOOK3S_64_HV */
   
+ +#ifdef CONFIG_PPC_BOOK3S_64
+ +      HSTATE_FIELD(HSTATE_CFAR, cfar);
+ +#endif /* CONFIG_PPC_BOOK3S_64 */
+ +
   #else /* CONFIG_PPC_BOOK3S */
         DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
         DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
diff --combined arch/powerpc/kvm/book3s_pr.c

index 6702442ca81899c2b92948339fd087c5bb15509e,73ed11c41bacfe691f60607c5519b179ae13f3cf..5e93438afb068c89b27062cec2bb772d4e77ce4c
--- 1/arch/powerpc/kvm/book3s_pr.c
--- 2/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@@ -34,8 -34,6 +34,8 @@@
   #include <asm/kvm_book3s.h>
   #include <asm/mmu_context.h>
   #include <asm/switch_to.h>
+ +#include <asm/firmware.h>
+ +#include <asm/hvcall.h>
   #include <linux/gfp.h>
   #include <linux/sched.h>
   #include <linux/vmalloc.h>
@@@ -762,6 -760,11 +762,11 @@@ program_interrupt
                         run->exit_reason = KVM_EXIT_MMIO;
                         r = RESUME_HOST_NV;
                         break;
+               case EMULATE_DO_PAPR:
+                       run->exit_reason = KVM_EXIT_PAPR_HCALL;
+                       vcpu->arch.hcall_needed = 1;
+                       r = RESUME_HOST_NV;
+                       break;
                 default:
                         BUG();
                 }
@@@ -1286,21 -1289,12 +1291,21 @@@ void kvmppc_core_flush_memslot(struct k
   {
   }
   
+ +static unsigned int kvm_global_user_count = 0;
+ +static DEFINE_SPINLOCK(kvm_global_user_count_lock);
+ +
   int kvmppc_core_init_vm(struct kvm *kvm)
   {
   #ifdef CONFIG_PPC64
         INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
   #endif
   
+ +      if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+ +              spin_lock(&kvm_global_user_count_lock);
+ +              if (++kvm_global_user_count == 1)
+ +                      pSeries_disable_reloc_on_exc();
+ +              spin_unlock(&kvm_global_user_count_lock);
+ +      }
         return 0;
   }
   
@@@ -1309,14 -1303,6 +1314,14 @@@ void kvmppc_core_destroy_vm(struct kvm 
   #ifdef CONFIG_PPC64
         WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
   #endif
+ +
+ +      if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+ +              spin_lock(&kvm_global_user_count_lock);
+ +              BUG_ON(kvm_global_user_count == 0);
+ +              if (--kvm_global_user_count == 0)
+ +                      pSeries_enable_reloc_on_exc();
+ +              spin_unlock(&kvm_global_user_count_lock);
+ +      }
   }
   
   static int kvmppc_book3s_init(void)
diff --combined arch/powerpc/kvm/emulate.c

index 9d9cddc5b346ff9518009539c08804b1cf0f9682,71abcf4e2bdafa13a8226bea831a49545c9c2810..7a73b6f72a8ba4a9031d3426c93b32abc03e0951
--- 1/arch/powerpc/kvm/emulate.c
--- 2/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@@ -39,7 -39,6 +39,7 @@@
   #define OP_31_XOP_TRAP      4
   #define OP_31_XOP_LWZX      23
   #define OP_31_XOP_TRAP_64   68
+ +#define OP_31_XOP_DCBF      86
   #define OP_31_XOP_LBZX      87
   #define OP_31_XOP_STWX      151
   #define OP_31_XOP_STBX      215
@@@ -150,8 -149,6 +150,6 @@@ static int kvmppc_emulate_mtspr(struct 
         case SPRN_TBWL: break;
         case SPRN_TBWU: break;
   
-       case SPRN_MSSSR0: break;
- 
         case SPRN_DEC:
                 vcpu->arch.dec = spr_val;
                 kvmppc_emulate_dec(vcpu);
@@@ -202,9 -199,6 +200,6 @@@ static int kvmppc_emulate_mfspr(struct 
         case SPRN_PIR:
                 spr_val = vcpu->vcpu_id;
                 break;
-       case SPRN_MSSSR0:
-               spr_val = 0;
-               break;
   
         /* Note: mftb and TBRL/TBWL are user-accessible, so
          * the guest can always access the real TB anyways.
@@@ -375,7 -369,6 +370,7 @@@ int kvmppc_emulate_instruction(struct k
                         emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
                         break;
   
+ +              case OP_31_XOP_DCBF:
                 case OP_31_XOP_DCBI:
                         /* Do nothing. The guest is performing dcbi because
                          * hardware DMA is not snooped by the dcache, but
diff --combined arch/s390/include/asm/irq.h

index 7def77302d630995a1018cd2ffe02a05ee289e10,aa6d0d74cec91ca06d409655a8d5bb2a60cb85e7..87c17bfb2968e8423fed70784417b36384754f91
--- 1/arch/s390/include/asm/irq.h
--- 2/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@@ -2,61 -2,44 +2,62 @@@
   #define _ASM_IRQ_H
   
   #include <linux/hardirq.h>
+ +#include <linux/percpu.h>
+ +#include <linux/cache.h>
   #include <linux/types.h>
   
- -enum interruption_class {
+ +enum interruption_main_class {
         EXTERNAL_INTERRUPT,
         IO_INTERRUPT,
- -      EXTINT_CLK,
- -      EXTINT_EXC,
- -      EXTINT_EMS,
- -      EXTINT_TMR,
- -      EXTINT_TLA,
- -      EXTINT_PFL,
- -      EXTINT_DSD,
- -      EXTINT_VRT,
- -      EXTINT_SCP,
- -      EXTINT_IUC,
- -      EXTINT_CMS,
- -      EXTINT_CMC,
- -      EXTINT_CMR,
- -      IOINT_CIO,
- -      IOINT_QAI,
- -      IOINT_DAS,
- -      IOINT_C15,
- -      IOINT_C70,
- -      IOINT_TAP,
- -      IOINT_VMR,
- -      IOINT_LCS,
- -      IOINT_CLW,
- -      IOINT_CTC,
- -      IOINT_APB,
- -      IOINT_ADM,
- -      IOINT_CSC,
- -      IOINT_PCI,
- -      IOINT_MSI,
- -      IOINT_VIR,
+ +      NR_IRQS
+ +};
+ +
+ +enum interruption_class {
+ +      IRQEXT_CLK,
+ +      IRQEXT_EXC,
+ +      IRQEXT_EMS,
+ +      IRQEXT_TMR,
+ +      IRQEXT_TLA,
+ +      IRQEXT_PFL,
+ +      IRQEXT_DSD,
+ +      IRQEXT_VRT,
+ +      IRQEXT_SCP,
+ +      IRQEXT_IUC,
+ +      IRQEXT_CMS,
+ +      IRQEXT_CMC,
+ +      IRQEXT_CMR,
+ +      IRQIO_CIO,
+ +      IRQIO_QAI,
+ +      IRQIO_DAS,
+ +      IRQIO_C15,
+ +      IRQIO_C70,
+ +      IRQIO_TAP,
+ +      IRQIO_VMR,
+ +      IRQIO_LCS,
+ +      IRQIO_CLW,
+ +      IRQIO_CTC,
+ +      IRQIO_APB,
+ +      IRQIO_ADM,
+ +      IRQIO_CSC,
+ +      IRQIO_PCI,
+ +      IRQIO_MSI,
++      IRQIO_VIR,
         NMI_NMI,
- -      NR_IRQS,
+ +      CPU_RST,
+ +      NR_ARCH_IRQS
   };
   
+ +struct irq_stat {
+ +      unsigned int irqs[NR_ARCH_IRQS];
+ +};
+ +
+ +DECLARE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
+ +
+ +static __always_inline void inc_irq_stat(enum interruption_class irq)
+ +{
+ +      __get_cpu_var(irq_stat).irqs[irq]++;
+ +}
+ +
   struct ext_code {
         unsigned short subcode;
         unsigned short code;
diff --combined arch/s390/kernel/irq.c

index 9df824ea16672aea9e8f6a93ec6c925f7b982a3a,a9806ea3ebd7244db2a45b370927946fdecbfe01..1630f439cd2a567d27d0ddb5828917808fec5f15
--- 1/arch/s390/kernel/irq.c
--- 2/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@@ -24,65 -24,44 +24,66 @@@
   #include <asm/irq.h>
   #include "entry.h"
   
+ +DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
+ +EXPORT_PER_CPU_SYMBOL_GPL(irq_stat);
+ +
   struct irq_class {
         char *name;
         char *desc;
   };
   
- -static const struct irq_class intrclass_names[] = {
+ +/*
+ + * The list of "main" irq classes on s390. This is the list of interrupts
+ + * that appear both in /proc/stat ("intr" line) and /proc/interrupts.
+ + * Historically only external and I/O interrupts have been part of /proc/stat.
+ + * We can't add the split external and I/O sub classes since the first field
+ + * in the "intr" line in /proc/stat is supposed to be the sum of all other
+ + * fields.
+ + * Since the external and I/O interrupt fields are already sums we would end
+ + * up with having a sum which accounts each interrupt twice.
+ + */
+ +static const struct irq_class irqclass_main_desc[NR_IRQS] = {
         [EXTERNAL_INTERRUPT] = {.name = "EXT"},
- -      [IO_INTERRUPT]       = {.name = "I/O"},
- -      [EXTINT_CLK] = {.name = "CLK", .desc = "[EXT] Clock Comparator"},
- -      [EXTINT_EXC] = {.name = "EXC", .desc = "[EXT] External Call"},
- -      [EXTINT_EMS] = {.name = "EMS", .desc = "[EXT] Emergency Signal"},
- -      [EXTINT_TMR] = {.name = "TMR", .desc = "[EXT] CPU Timer"},
- -      [EXTINT_TLA] = {.name = "TAL", .desc = "[EXT] Timing Alert"},
- -      [EXTINT_PFL] = {.name = "PFL", .desc = "[EXT] Pseudo Page Fault"},
- -      [EXTINT_DSD] = {.name = "DSD", .desc = "[EXT] DASD Diag"},
- -      [EXTINT_VRT] = {.name = "VRT", .desc = "[EXT] Virtio"},
- -      [EXTINT_SCP] = {.name = "SCP", .desc = "[EXT] Service Call"},
- -      [EXTINT_IUC] = {.name = "IUC", .desc = "[EXT] IUCV"},
- -      [EXTINT_CMS] = {.name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"},
- -      [EXTINT_CMC] = {.name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"},
- -      [EXTINT_CMR] = {.name = "CMR", .desc = "[EXT] CPU-Measurement: RI"},
- -      [IOINT_CIO]  = {.name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"},
- -      [IOINT_QAI]  = {.name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"},
- -      [IOINT_DAS]  = {.name = "DAS", .desc = "[I/O] DASD"},
- -      [IOINT_C15]  = {.name = "C15", .desc = "[I/O] 3215"},
- -      [IOINT_C70]  = {.name = "C70", .desc = "[I/O] 3270"},
- -      [IOINT_TAP]  = {.name = "TAP", .desc = "[I/O] Tape"},
- -      [IOINT_VMR]  = {.name = "VMR", .desc = "[I/O] Unit Record Devices"},
- -      [IOINT_LCS]  = {.name = "LCS", .desc = "[I/O] LCS"},
- -      [IOINT_CLW]  = {.name = "CLW", .desc = "[I/O] CLAW"},
- -      [IOINT_CTC]  = {.name = "CTC", .desc = "[I/O] CTC"},
- -      [IOINT_APB]  = {.name = "APB", .desc = "[I/O] AP Bus"},
- -      [IOINT_ADM]  = {.name = "ADM", .desc = "[I/O] EADM Subchannel"},
- -      [IOINT_CSC]  = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
- -      [IOINT_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
- -      [IOINT_MSI] =  {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
- -      [IOINT_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
+ +      [IO_INTERRUPT]       = {.name = "I/O"}
+ +};
+ +
+ +/*
+ + * The list of split external and I/O interrupts that appear only in
+ + * /proc/interrupts.
+ + * In addition this list contains non external / I/O events like NMIs.
+ + */
+ +static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
+ +      [IRQEXT_CLK] = {.name = "CLK", .desc = "[EXT] Clock Comparator"},
+ +      [IRQEXT_EXC] = {.name = "EXC", .desc = "[EXT] External Call"},
+ +      [IRQEXT_EMS] = {.name = "EMS", .desc = "[EXT] Emergency Signal"},
+ +      [IRQEXT_TMR] = {.name = "TMR", .desc = "[EXT] CPU Timer"},
+ +      [IRQEXT_TLA] = {.name = "TAL", .desc = "[EXT] Timing Alert"},
+ +      [IRQEXT_PFL] = {.name = "PFL", .desc = "[EXT] Pseudo Page Fault"},
+ +      [IRQEXT_DSD] = {.name = "DSD", .desc = "[EXT] DASD Diag"},
+ +      [IRQEXT_VRT] = {.name = "VRT", .desc = "[EXT] Virtio"},
+ +      [IRQEXT_SCP] = {.name = "SCP", .desc = "[EXT] Service Call"},
+ +      [IRQEXT_IUC] = {.name = "IUC", .desc = "[EXT] IUCV"},
+ +      [IRQEXT_CMS] = {.name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"},
+ +      [IRQEXT_CMC] = {.name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"},
+ +      [IRQEXT_CMR] = {.name = "CMR", .desc = "[EXT] CPU-Measurement: RI"},
+ +      [IRQIO_CIO]  = {.name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"},
+ +      [IRQIO_QAI]  = {.name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"},
+ +      [IRQIO_DAS]  = {.name = "DAS", .desc = "[I/O] DASD"},
+ +      [IRQIO_C15]  = {.name = "C15", .desc = "[I/O] 3215"},
+ +      [IRQIO_C70]  = {.name = "C70", .desc = "[I/O] 3270"},
+ +      [IRQIO_TAP]  = {.name = "TAP", .desc = "[I/O] Tape"},
+ +      [IRQIO_VMR]  = {.name = "VMR", .desc = "[I/O] Unit Record Devices"},
+ +      [IRQIO_LCS]  = {.name = "LCS", .desc = "[I/O] LCS"},
+ +      [IRQIO_CLW]  = {.name = "CLW", .desc = "[I/O] CLAW"},
+ +      [IRQIO_CTC]  = {.name = "CTC", .desc = "[I/O] CTC"},
+ +      [IRQIO_APB]  = {.name = "APB", .desc = "[I/O] AP Bus"},
+ +      [IRQIO_ADM]  = {.name = "ADM", .desc = "[I/O] EADM Subchannel"},
+ +      [IRQIO_CSC]  = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
+ +      [IRQIO_PCI]  = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
+ +      [IRQIO_MSI]  = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
++      [IRQIO_VIR]  = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
         [NMI_NMI]    = {.name = "NMI", .desc = "[NMI] Machine Check"},
+ +      [CPU_RST]    = {.name = "RST", .desc = "[CPU] CPU Restart"},
   };
   
   /*
@@@ -90,34 -69,30 +91,34 @@@
    */
   int show_interrupts(struct seq_file *p, void *v)
   {
- -      int i = *(loff_t *) v, j;
+ +      int irq = *(loff_t *) v;
+ +      int cpu;
   
         get_online_cpus();
- -      if (i == 0) {
+ +      if (irq == 0) {
                 seq_puts(p, "           ");
- -              for_each_online_cpu(j)
- -                      seq_printf(p, "CPU%d       ",j);
+ +              for_each_online_cpu(cpu)
+ +                      seq_printf(p, "CPU%d       ", cpu);
                 seq_putc(p, '\n');
         }
- -
- -      if (i < NR_IRQS) {
- -              seq_printf(p, "%s: ", intrclass_names[i].name);
- -#ifndef CONFIG_SMP
- -              seq_printf(p, "%10u ", kstat_irqs(i));
- -#else
- -              for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
- -#endif
- -              if (intrclass_names[i].desc)
- -                      seq_printf(p, "  %s", intrclass_names[i].desc);
- -                seq_putc(p, '\n');
- -        }
+ +      if (irq < NR_IRQS) {
+ +              seq_printf(p, "%s: ", irqclass_main_desc[irq].name);
+ +              for_each_online_cpu(cpu)
+ +                      seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[irq]);
+ +              seq_putc(p, '\n');
+ +              goto skip_arch_irqs;
+ +      }
+ +      for (irq = 0; irq < NR_ARCH_IRQS; irq++) {
+ +              seq_printf(p, "%s: ", irqclass_sub_desc[irq].name);
+ +              for_each_online_cpu(cpu)
+ +                      seq_printf(p, "%10u ", per_cpu(irq_stat, cpu).irqs[irq]);
+ +              if (irqclass_sub_desc[irq].desc)
+ +                      seq_printf(p, "  %s", irqclass_sub_desc[irq].desc);
+ +              seq_putc(p, '\n');
+ +      }
+ +skip_arch_irqs:
         put_online_cpus();
- -        return 0;
+ +      return 0;
   }
   
   /*
@@@ -248,7 -223,7 +249,7 @@@ void __irq_entry do_extint(struct pt_re
                 /* Serve timer interrupts first. */
                 clock_comparator_work();
         }
- -      kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++;
+ +      kstat_incr_irqs_this_cpu(EXTERNAL_INTERRUPT, NULL);
         if (ext_code.code != 0x1004)
                 __get_cpu_var(s390_idle).nohz_delay = 1;
   
diff --combined arch/s390/kvm/interrupt.c

index 87418b50f21cec676929159f84765152a0395e53,2f6ccb065c4aa47aca03c1f5276ec2b253b01f0b..37116a77cb4b8b16bc8dbb698e34ced1510b03f5
--- 1/arch/s390/kvm/interrupt.c
--- 2/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@@ -21,11 -21,31 +21,31 @@@
   #include "gaccess.h"
   #include "trace-s390.h"
   
+ #define IOINT_SCHID_MASK 0x0000ffff
+ #define IOINT_SSID_MASK 0x00030000
+ #define IOINT_CSSID_MASK 0x03fc0000
+ #define IOINT_AI_MASK 0x04000000
+ 
+ static int is_ioint(u64 type)
+ {
+       return ((type & 0xfffe0000u) != 0xfffe0000u);
+ }
+ 
   static int psw_extint_disabled(struct kvm_vcpu *vcpu)
   {
         return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
   }
   
+ static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
+ {
+       return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
+ }
+ 
+ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
+ {
+       return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
+ }
+ 
   static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
   {
         if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
@@@ -35,6 -55,13 +55,13 @@@
         return 1;
   }
   
+ static u64 int_word_to_isc_bits(u32 int_word)
+ {
+       u8 isc = (int_word & 0x38000000) >> 27;
+ 
+       return (0x80 >> isc) << 24;
+ }
+ 
   static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
                                       struct kvm_s390_interrupt_info *inti)
   {
@@@ -67,7 -94,22 +94,22 @@@
         case KVM_S390_SIGP_SET_PREFIX:
         case KVM_S390_RESTART:
                 return 1;
+       case KVM_S390_MCHK:
+               if (psw_mchk_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
+                       return 1;
+               return 0;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (psw_ioint_disabled(vcpu))
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[6] &
+                   int_word_to_isc_bits(inti->io.io_int_word))
+                       return 1;
+               return 0;
         default:
+               printk(KERN_WARNING "illegal interrupt type %llx\n",
+                      inti->type);
                 BUG();
         }
         return 0;
@@@ -93,6 -135,7 +135,7 @@@ static void __reset_intercept_indicator
                 CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
                 &vcpu->arch.sie_block->cpuflags);
         vcpu->arch.sie_block->lctl = 0x0000;
+       vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
   }
   
   static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@@ -116,6 -159,18 +159,18 @@@ static void __set_intercept_indicator(s
         case KVM_S390_SIGP_STOP:
                 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
                 break;
+       case KVM_S390_MCHK:
+               if (psw_mchk_disabled(vcpu))
+                       vcpu->arch.sie_block->ictl |= ICTL_LPSW;
+               else
+                       vcpu->arch.sie_block->lctl |= LCTL_CR14;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (psw_ioint_disabled(vcpu))
+                       __set_cpuflag(vcpu, CPUSTAT_IO_INT);
+               else
+                       vcpu->arch.sie_block->lctl |= LCTL_CR6;
+               break;
         default:
                 BUG();
         }
@@@ -297,6 -352,73 +352,73 @@@ static void __do_deliver_interrupt(stru
                         exception = 1;
                 break;
   
+       case KVM_S390_MCHK:
+               VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
+                          inti->mchk.mcic);
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                inti->mchk.cr14,
+                                                inti->mchk.mcic);
+               rc = kvm_s390_vcpu_store_status(vcpu,
+                                               KVM_S390_STORE_STATUS_PREFIXED);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
+                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                    __LC_MCK_NEW_PSW, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               break;
+ 
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+       {
+               __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
+                       inti->io.subchannel_nr;
+               __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
+                       inti->io.io_int_word;
+               VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
+               vcpu->stat.deliver_io_int++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+                                                param0, param1);
+               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
+                                  inti->io.subchannel_id);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
+                                  inti->io.subchannel_nr);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
+                                  inti->io.io_int_parm);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
+                                  inti->io.io_int_word);
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
+                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+ 
+               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                    __LC_IO_NEW_PSW, sizeof(psw_t));
+               if (rc == -EFAULT)
+                       exception = 1;
+               break;
+       }
         default:
                 BUG();
         }
@@@ -362,7 -484,7 +484,7 @@@ static int kvm_cpu_has_interrupt(struc
         }
   
         if ((!rc) && (vcpu->arch.sie_block->ckc <
- -              get_clock() + vcpu->arch.sie_block->epoch)) {
+ +              get_tod_clock() + vcpu->arch.sie_block->epoch)) {
                 if ((!psw_extint_disabled(vcpu)) &&
                         (vcpu->arch.sie_block->gcr[0] & 0x800ul))
                         rc = 1;
@@@ -402,13 -524,13 +524,13 @@@ int kvm_s390_handle_wait(struct kvm_vcp
                 goto no_timer;
         }
   
- -      now = get_clock() + vcpu->arch.sie_block->epoch;
+ +      now = get_tod_clock() + vcpu->arch.sie_block->epoch;
         if (vcpu->arch.sie_block->ckc < now) {
                 __unset_cpu_idle(vcpu);
                 return 0;
         }
   
- -      sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9;
+ +      sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
   
         hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
         VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
@@@ -492,7 -614,7 +614,7 @@@ void kvm_s390_deliver_pending_interrupt
         }
   
         if ((vcpu->arch.sie_block->ckc <
- -              get_clock() + vcpu->arch.sie_block->epoch))
+ +              get_tod_clock() + vcpu->arch.sie_block->epoch))
                 __try_deliver_ckc_interrupt(vcpu);
   
         if (atomic_read(&fi->active)) {
@@@ -518,6 -640,61 +640,61 @@@
         }
   }
   
+ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
+       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+       int deliver;
+ 
+       __reset_intercept_indicators(vcpu);
+       if (atomic_read(&li->active)) {
+               do {
+                       deliver = 0;
+                       spin_lock_bh(&li->lock);
+                       list_for_each_entry_safe(inti, n, &li->list, list) {
+                               if ((inti->type == KVM_S390_MCHK) &&
+                                   __interrupt_is_deliverable(vcpu, inti)) {
+                                       list_del(&inti->list);
+                                       deliver = 1;
+                                       break;
+                               }
+                               __set_intercept_indicator(vcpu, inti);
+                       }
+                       if (list_empty(&li->list))
+                               atomic_set(&li->active, 0);
+                       spin_unlock_bh(&li->lock);
+                       if (deliver) {
+                               __do_deliver_interrupt(vcpu, inti);
+                               kfree(inti);
+                       }
+               } while (deliver);
+       }
+ 
+       if (atomic_read(&fi->active)) {
+               do {
+                       deliver = 0;
+                       spin_lock(&fi->lock);
+                       list_for_each_entry_safe(inti, n, &fi->list, list) {
+                               if ((inti->type == KVM_S390_MCHK) &&
+                                   __interrupt_is_deliverable(vcpu, inti)) {
+                                       list_del(&inti->list);
+                                       deliver = 1;
+                                       break;
+                               }
+                               __set_intercept_indicator(vcpu, inti);
+                       }
+                       if (list_empty(&fi->list))
+                               atomic_set(&fi->active, 0);
+                       spin_unlock(&fi->lock);
+                       if (deliver) {
+                               __do_deliver_interrupt(vcpu, inti);
+                               kfree(inti);
+                       }
+               } while (deliver);
+       }
+ }
+ 
   int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
   {
         struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@@ -540,12 -717,50 +717,50 @@@
         return 0;
   }
   
+ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
+                                                   u64 cr6, u64 schid)
+ {
+       struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_interrupt_info *inti, *iter;
+ 
+       if ((!schid && !cr6) || (schid && cr6))
+               return NULL;
+       mutex_lock(&kvm->lock);
+       fi = &kvm->arch.float_int;
+       spin_lock(&fi->lock);
+       inti = NULL;
+       list_for_each_entry(iter, &fi->list, list) {
+               if (!is_ioint(iter->type))
+                       continue;
+               if (cr6 &&
+                   ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
+                       continue;
+               if (schid) {
+                       if (((schid & 0x00000000ffff0000) >> 16) !=
+                           iter->io.subchannel_id)
+                               continue;
+                       if ((schid & 0x000000000000ffff) !=
+                           iter->io.subchannel_nr)
+                               continue;
+               }
+               inti = iter;
+               break;
+       }
+       if (inti)
+               list_del_init(&inti->list);
+       if (list_empty(&fi->list))
+               atomic_set(&fi->active, 0);
+       spin_unlock(&fi->lock);
+       mutex_unlock(&kvm->lock);
+       return inti;
+ }
+ 
   int kvm_s390_inject_vm(struct kvm *kvm,
                        struct kvm_s390_interrupt *s390int)
   {
         struct kvm_s390_local_interrupt *li;
         struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_interrupt_info *inti, *iter;
         int sigcpu;
   
         inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@@ -569,6 -784,29 +784,29 @@@
         case KVM_S390_SIGP_STOP:
         case KVM_S390_INT_EXTERNAL_CALL:
         case KVM_S390_INT_EMERGENCY:
+               kfree(inti);
+               return -EINVAL;
+       case KVM_S390_MCHK:
+               VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
+                        s390int->parm64);
+               inti->type = s390int->type;
+               inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
+               inti->mchk.mcic = s390int->parm64;
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               if (s390int->type & IOINT_AI_MASK)
+                       VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
+               else
+                       VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
+                                s390int->type & IOINT_CSSID_MASK,
+                                s390int->type & IOINT_SSID_MASK,
+                                s390int->type & IOINT_SCHID_MASK);
+               inti->type = s390int->type;
+               inti->io.subchannel_id = s390int->parm >> 16;
+               inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
+               inti->io.io_int_parm = s390int->parm64 >> 32;
+               inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
+               break;
         default:
                 kfree(inti);
                 return -EINVAL;
@@@ -579,7 -817,22 +817,22 @@@
         mutex_lock(&kvm->lock);
         fi = &kvm->arch.float_int;
         spin_lock(&fi->lock);
-       list_add_tail(&inti->list, &fi->list);
+       if (!is_ioint(inti->type))
+               list_add_tail(&inti->list, &fi->list);
+       else {
+               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+ 
+               /* Keep I/O interrupts sorted in isc order. */
+               list_for_each_entry(iter, &fi->list, list) {
+                       if (!is_ioint(iter->type))
+                               continue;
+                       if (int_word_to_isc_bits(iter->io.io_int_word)
+                           <= isc_bits)
+                               continue;
+                       break;
+               }
+               list_add_tail(&inti->list, &iter->list);
+       }
         atomic_set(&fi->active, 1);
         sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
         if (sigcpu == KVM_MAX_VCPUS) {
@@@ -651,8 -904,15 +904,15 @@@ int kvm_s390_inject_vcpu(struct kvm_vcp
                 inti->type = s390int->type;
                 inti->emerg.code = s390int->parm;
                 break;
+       case KVM_S390_MCHK:
+               VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
+                          s390int->parm64);
+               inti->type = s390int->type;
+               inti->mchk.mcic = s390int->parm64;
+               break;
         case KVM_S390_INT_VIRTIO:
         case KVM_S390_INT_SERVICE:
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
         default:
                 kfree(inti);
                 return -EINVAL;
diff --combined arch/s390/kvm/kvm-s390.c

index 2923781590a610a49a7efa33f4f5a8758011774e,4377d18866319e96c3d2ecf5e313701283fe96cb..4cf35a0a79e7734b6cd99803f3f54bd2df5c6396
--- 1/arch/s390/kvm/kvm-s390.c
--- 2/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@@ -140,6 -140,8 +140,8 @@@ int kvm_dev_ioctl_check_extension(long 
   #endif
         case KVM_CAP_SYNC_REGS:
         case KVM_CAP_ONE_REG:
+       case KVM_CAP_ENABLE_CAP:
+       case KVM_CAP_S390_CSS_SUPPORT:
                 r = 1;
                 break;
         case KVM_CAP_NR_VCPUS:
@@@ -147,7 -149,7 +149,7 @@@
                 r = KVM_MAX_VCPUS;
                 break;
         case KVM_CAP_S390_COW:
- -              r = sclp_get_fac85() & 0x2;
+ +              r = MACHINE_HAS_ESOP;
                 break;
         default:
                 r = 0;
@@@ -234,6 -236,9 +236,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
                 if (!kvm->arch.gmap)
                         goto out_nogmap;
         }
+ 
+       kvm->arch.css_support = 0;
+ 
         return 0;
   out_nogmap:
         debug_unregister(kvm->arch.dbf);
@@@ -613,9 -618,7 +618,9 @@@ static int __vcpu_run(struct kvm_vcpu *
                 kvm_s390_deliver_pending_interrupts(vcpu);
   
         vcpu->arch.sie_block->icptcode = 0;
+ +      preempt_disable();
         kvm_guest_enter();
+ +      preempt_enable();
         VCPU_EVENT(vcpu, 6, "entering sie flags %x",
                    atomic_read(&vcpu->arch.sie_block->cpuflags));
         trace_kvm_s390_sie_enter(vcpu,
@@@ -659,6 -662,7 +664,7 @@@ rerun_vcpu
         case KVM_EXIT_INTR:
         case KVM_EXIT_S390_RESET:
         case KVM_EXIT_S390_UCONTROL:
+       case KVM_EXIT_S390_TSCH:
                 break;
         default:
                 BUG();
@@@ -766,6 -770,14 +772,14 @@@ int kvm_s390_vcpu_store_status(struct k
         } else
                 prefix = 0;
   
+       /*
+        * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
+        * copying in vcpu load/put. Let's update our copies before we save
+        * them into the save area
+        */
+       save_fp_regs(&vcpu->arch.guest_fpregs);
+       save_access_regs(vcpu->run->s.regs.acrs);
+ 
         if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
                         vcpu->arch.guest_fpregs.fprs, 128, prefix))
                 return -EFAULT;
@@@ -810,6 -822,29 +824,29 @@@
         return 0;
   }
   
+ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+                                    struct kvm_enable_cap *cap)
+ {
+       int r;
+ 
+       if (cap->flags)
+               return -EINVAL;
+ 
+       switch (cap->cap) {
+       case KVM_CAP_S390_CSS_SUPPORT:
+               if (!vcpu->kvm->arch.css_support) {
+                       vcpu->kvm->arch.css_support = 1;
+                       trace_kvm_s390_enable_css(vcpu->kvm);
+               }
+               r = 0;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+       return r;
+ }
+ 
   long kvm_arch_vcpu_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
   {
@@@ -896,6 -931,15 +933,15 @@@
                         r = 0;
                 break;
         }
+       case KVM_ENABLE_CAP:
+       {
+               struct kvm_enable_cap cap;
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       break;
+               r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+               break;
+       }
         default:
                 r = -ENOTTY;
         }
@@@ -930,7 -974,7 +976,7 @@@ int kvm_arch_prepare_memory_region(stru
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_memory_slot old,
                                    struct kvm_userspace_memory_region *mem,
-                                  int user_alloc)
+                                  bool user_alloc)
   {
         /* A few sanity checks. We can have exactly one memory slot which has
            to start at guest virtual zero and which has to be located at a
@@@ -960,7 -1004,7 +1006,7 @@@
   void kvm_arch_commit_memory_region(struct kvm *kvm,
                                 struct kvm_userspace_memory_region *mem,
                                 struct kvm_memory_slot old,
-                               int user_alloc)
+                               bool user_alloc)
   {
         int rc;
   
diff --combined arch/x86/include/asm/kvm_para.h

index 65231e173bafceb5895f49da00b95f39c3088beb,f49c16d47581dcab236ccd096c6d0ecd322b2a17..695399f2d5eb315a62823a9fb8b1673a90447b04
--- 1/arch/x86/include/asm/kvm_para.h
--- 2/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@@ -1,8 -1,103 +1,8 @@@
   #ifndef _ASM_X86_KVM_PARA_H
   #define _ASM_X86_KVM_PARA_H
   
- -#include <linux/types.h>
- -#include <asm/hyperv.h>
- -
- -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
- - * should be used to determine that a VM is running under KVM.
- - */
- -#define KVM_CPUID_SIGNATURE   0x40000000
- -
- -/* This CPUID returns a feature bitmap in eax.  Before enabling a particular
- - * paravirtualization, the appropriate feature bit should be checked.
- - */
- -#define KVM_CPUID_FEATURES    0x40000001
- -#define KVM_FEATURE_CLOCKSOURCE               0
- -#define KVM_FEATURE_NOP_IO_DELAY      1
- -#define KVM_FEATURE_MMU_OP            2
- -/* This indicates that the new set of kvmclock msrs
- - * are available. The use of 0x11 and 0x12 is deprecated
- - */
- -#define KVM_FEATURE_CLOCKSOURCE2        3
- -#define KVM_FEATURE_ASYNC_PF          4
- -#define KVM_FEATURE_STEAL_TIME                5
- -#define KVM_FEATURE_PV_EOI            6
- -
- -/* The last 8 bits are used to indicate how to interpret the flags field
- - * in pvclock structure. If no bits are set, all flags are ignored.
- - */
- -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT    24
- -
- -#define MSR_KVM_WALL_CLOCK  0x11
- -#define MSR_KVM_SYSTEM_TIME 0x12
- -
- -#define KVM_MSR_ENABLED 1
- -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
- -#define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
- -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
- -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
- -#define MSR_KVM_STEAL_TIME  0x4b564d03
- -#define MSR_KVM_PV_EOI_EN      0x4b564d04
- -
- -struct kvm_steal_time {
- -      __u64 steal;
- -      __u32 version;
- -      __u32 flags;
- -      __u32 pad[12];
- -};
- -
- -#define KVM_STEAL_ALIGNMENT_BITS 5
- -#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
- -#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
- -
- -#define KVM_MAX_MMU_OP_BATCH           32
- -
- -#define KVM_ASYNC_PF_ENABLED                  (1 << 0)
- -#define KVM_ASYNC_PF_SEND_ALWAYS              (1 << 1)
- -
- -/* Operations for KVM_HC_MMU_OP */
- -#define KVM_MMU_OP_WRITE_PTE            1
- -#define KVM_MMU_OP_FLUSH_TLB          2
- -#define KVM_MMU_OP_RELEASE_PT         3
- -
- -/* Payload for KVM_HC_MMU_OP */
- -struct kvm_mmu_op_header {
- -      __u32 op;
- -      __u32 pad;
- -};
- -
- -struct kvm_mmu_op_write_pte {
- -      struct kvm_mmu_op_header header;
- -      __u64 pte_phys;
- -      __u64 pte_val;
- -};
- -
- -struct kvm_mmu_op_flush_tlb {
- -      struct kvm_mmu_op_header header;
- -};
- -
- -struct kvm_mmu_op_release_pt {
- -      struct kvm_mmu_op_header header;
- -      __u64 pt_phys;
- -};
- -
- -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
- -#define KVM_PV_REASON_PAGE_READY 2
- -
- -struct kvm_vcpu_pv_apf_data {
- -      __u32 reason;
- -      __u8 pad[60];
- -      __u32 enabled;
- -};
- -
- -#define KVM_PV_EOI_BIT 0
- -#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
- -#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
- -#define KVM_PV_EOI_DISABLED 0x0
- -
- -#ifdef __KERNEL__
   #include <asm/processor.h>
+ +#include <uapi/asm/kvm_para.h>
   
   extern void kvmclock_init(void);
   extern int kvm_register_clock(char *txt);
@@@ -27,7 -122,7 +27,7 @@@ static inline bool kvm_check_and_clear_
    *
    * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
    * The hypercall number should be placed in rax and the return value will be
-  * placed in rax.  No other registers will be clobbered unless explicited
+  * placed in rax.  No other registers will be clobbered unless explicitly
    * noted by the particular hypercall.
    */
   
@@@ -85,13 -180,13 +85,13 @@@ static inline long kvm_hypercall4(unsig
         return ret;
   }
   
- -static inline int kvm_para_available(void)
+ +static inline bool kvm_para_available(void)
   {
         unsigned int eax, ebx, ecx, edx;
         char signature[13];
   
         if (boot_cpu_data.cpuid_level < 0)
- -              return 0;       /* So we don't blow up on old processors */
+ +              return false;   /* So we don't blow up on old processors */
   
         if (cpu_has_hypervisor) {
                 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
@@@ -101,10 -196,10 +101,10 @@@
                 signature[12] = 0;
   
                 if (strcmp(signature, "KVMKVMKVM") == 0)
- -                      return 1;
+ +                      return true;
         }
   
- -      return 0;
+ +      return false;
   }
   
   static inline unsigned int kvm_arch_para_features(void)
@@@ -133,4 -228,6 +133,4 @@@ static inline void kvm_disable_steal_ti
   }
   #endif
   
- -#endif /* __KERNEL__ */
- -
   #endif /* _ASM_X86_KVM_PARA_H */
diff --combined arch/x86/include/asm/vmx.h

index 235b49fa554bd614c6cb6b61c9c3618f1cd91a99,5c9dbadd364a3064072b491c1409216b852f3819..b6fbf860e398ed940bfb1113acd769a0afc827da
--- 1/arch/x86/include/asm/vmx.h
--- 2/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@@ -1,3 -1,6 +1,3 @@@
- -#ifndef VMX_H
- -#define VMX_H
- -
   /*
    * vmx.h: VMX Architecture related definitions
    * Copyright (c) 2004, Intel Corporation.
@@@ -21,12 -24,97 +21,12 @@@
    *    Yaniv Kamay <yaniv@qumranet.com>
    *
    */
+ +#ifndef VMX_H
+ +#define VMX_H
   
- -#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
- -
- -#define EXIT_REASON_EXCEPTION_NMI       0
- -#define EXIT_REASON_EXTERNAL_INTERRUPT  1
- -#define EXIT_REASON_TRIPLE_FAULT        2
- -
- -#define EXIT_REASON_PENDING_INTERRUPT   7
- -#define EXIT_REASON_NMI_WINDOW          8
- -#define EXIT_REASON_TASK_SWITCH         9
- -#define EXIT_REASON_CPUID               10
- -#define EXIT_REASON_HLT                 12
- -#define EXIT_REASON_INVD                13
- -#define EXIT_REASON_INVLPG              14
- -#define EXIT_REASON_RDPMC               15
- -#define EXIT_REASON_RDTSC               16
- -#define EXIT_REASON_VMCALL              18
- -#define EXIT_REASON_VMCLEAR             19
- -#define EXIT_REASON_VMLAUNCH            20
- -#define EXIT_REASON_VMPTRLD             21
- -#define EXIT_REASON_VMPTRST             22
- -#define EXIT_REASON_VMREAD              23
- -#define EXIT_REASON_VMRESUME            24
- -#define EXIT_REASON_VMWRITE             25
- -#define EXIT_REASON_VMOFF               26
- -#define EXIT_REASON_VMON                27
- -#define EXIT_REASON_CR_ACCESS           28
- -#define EXIT_REASON_DR_ACCESS           29
- -#define EXIT_REASON_IO_INSTRUCTION      30
- -#define EXIT_REASON_MSR_READ            31
- -#define EXIT_REASON_MSR_WRITE           32
- -#define EXIT_REASON_INVALID_STATE       33
- -#define EXIT_REASON_MWAIT_INSTRUCTION   36
- -#define EXIT_REASON_MONITOR_INSTRUCTION 39
- -#define EXIT_REASON_PAUSE_INSTRUCTION   40
- -#define EXIT_REASON_MCE_DURING_VMENTRY  41
- -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
- -#define EXIT_REASON_APIC_ACCESS         44
- -#define EXIT_REASON_EOI_INDUCED         45
- -#define EXIT_REASON_EPT_VIOLATION       48
- -#define EXIT_REASON_EPT_MISCONFIG       49
- -#define EXIT_REASON_WBINVD              54
- -#define EXIT_REASON_XSETBV              55
- -#define EXIT_REASON_APIC_WRITE          56
- -#define EXIT_REASON_INVPCID             58
- -
- -#define VMX_EXIT_REASONS \
- -      { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
- -      { EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \
- -      { EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \
- -      { EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \
- -      { EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \
- -      { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
- -      { EXIT_REASON_CPUID,                 "CPUID" }, \
- -      { EXIT_REASON_HLT,                   "HLT" }, \
- -      { EXIT_REASON_INVLPG,                "INVLPG" }, \
- -      { EXIT_REASON_RDPMC,                 "RDPMC" }, \
- -      { EXIT_REASON_RDTSC,                 "RDTSC" }, \
- -      { EXIT_REASON_VMCALL,                "VMCALL" }, \
- -      { EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \
- -      { EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \
- -      { EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \
- -      { EXIT_REASON_VMPTRST,               "VMPTRST" }, \
- -      { EXIT_REASON_VMREAD,                "VMREAD" }, \
- -      { EXIT_REASON_VMRESUME,              "VMRESUME" }, \
- -      { EXIT_REASON_VMWRITE,               "VMWRITE" }, \
- -      { EXIT_REASON_VMOFF,                 "VMOFF" }, \
- -      { EXIT_REASON_VMON,                  "VMON" }, \
- -      { EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \
- -      { EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \
- -      { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
- -      { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
- -      { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
- -      { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
- -      { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
- -      { EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \
- -      { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
- -      { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
- -      { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
- -      { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
- -      { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
- -      { EXIT_REASON_WBINVD,                "WBINVD" }, \
- -      { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
- -      { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
- -      { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
- -      { EXIT_REASON_INVD,                  "INVD" }, \
- -      { EXIT_REASON_INVPCID,               "INVPCID" }
- -
- -#ifdef __KERNEL__
   
   #include <linux/types.h>
+ +#include <uapi/asm/vmx.h>
   
   /*
    * Definitions of Primary Processor-Based VM-Execution Controls.
@@@ -57,9 -145,12 +57,12 @@@
   #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
   #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
   #define SECONDARY_EXEC_RDTSCP                 0x00000008
+ #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010
   #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
   #define SECONDARY_EXEC_WBINVD_EXITING         0x00000040
   #define SECONDARY_EXEC_UNRESTRICTED_GUEST     0x00000080
+ #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
+ #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
   #define SECONDARY_EXEC_PAUSE_LOOP_EXITING     0x00000400
   #define SECONDARY_EXEC_ENABLE_INVPCID         0x00001000
   
@@@ -97,6 -188,7 +100,7 @@@ enum vmcs_field 
         GUEST_GS_SELECTOR               = 0x0000080a,
         GUEST_LDTR_SELECTOR             = 0x0000080c,
         GUEST_TR_SELECTOR               = 0x0000080e,
+       GUEST_INTR_STATUS               = 0x00000810,
         HOST_ES_SELECTOR                = 0x00000c00,
         HOST_CS_SELECTOR                = 0x00000c02,
         HOST_SS_SELECTOR                = 0x00000c04,
@@@ -124,6 -216,14 +128,14 @@@
         APIC_ACCESS_ADDR_HIGH           = 0x00002015,
         EPT_POINTER                     = 0x0000201a,
         EPT_POINTER_HIGH                = 0x0000201b,
+       EOI_EXIT_BITMAP0                = 0x0000201c,
+       EOI_EXIT_BITMAP0_HIGH           = 0x0000201d,
+       EOI_EXIT_BITMAP1                = 0x0000201e,
+       EOI_EXIT_BITMAP1_HIGH           = 0x0000201f,
+       EOI_EXIT_BITMAP2                = 0x00002020,
+       EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
+       EOI_EXIT_BITMAP3                = 0x00002022,
+       EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
         GUEST_PHYSICAL_ADDRESS          = 0x00002400,
         GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
         VMCS_LINK_POINTER               = 0x00002800,
@@@ -346,9 -446,9 +358,9 @@@
   
   #define AR_RESERVD_MASK 0xfffe0f00
   
- #define TSS_PRIVATE_MEMSLOT                   (KVM_MEMORY_SLOTS + 0)
- #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT      (KVM_MEMORY_SLOTS + 1)
- #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT    (KVM_MEMORY_SLOTS + 2)
+ #define TSS_PRIVATE_MEMSLOT                   (KVM_USER_MEM_SLOTS + 0)
+ #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT      (KVM_USER_MEM_SLOTS + 1)
+ #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT    (KVM_USER_MEM_SLOTS + 2)
   
   #define VMX_NR_VPIDS                          (1 << 16)
   #define VMX_VPID_EXTENT_SINGLE_CONTEXT                1
@@@ -445,3 -545,5 +457,3 @@@ enum vm_instruction_error_number 
   };
   
   #endif
- -
- -#endif
diff --combined arch/x86/include/uapi/asm/vmx.h

index 979d03bce135df3fe9bd9dc516cfaf449c22ea5e,0000000000000000000000000000000000000000..2871fccfee68619896f03d50ab4b93f75eaad8e3

mode 100644,000000..100644
--- 1/arch/x86/include/uapi/asm/vmx.h
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vmx.h
@@@ -1,109 -1,0 +1,116 @@@
-       { EXIT_REASON_WBINVD,                "WBINVD" }
+ +/*
+ + * vmx.h: VMX Architecture related definitions
+ + * Copyright (c) 2004, Intel Corporation.
+ + *
+ + * This program is free software; you can redistribute it and/or modify it
+ + * under the terms and conditions of the GNU General Public License,
+ + * version 2, as published by the Free Software Foundation.
+ + *
+ + * This program is distributed in the hope it will be useful, but WITHOUT
+ + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ + * more details.
+ + *
+ + * You should have received a copy of the GNU General Public License along with
+ + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ + * Place - Suite 330, Boston, MA 02111-1307 USA.
+ + *
+ + * A few random additions are:
+ + * Copyright (C) 2006 Qumranet
+ + *    Avi Kivity <avi@qumranet.com>
+ + *    Yaniv Kamay <yaniv@qumranet.com>
+ + *
+ + */
+ +#ifndef _UAPIVMX_H
+ +#define _UAPIVMX_H
+ +
+ +
+ +#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+ +
+ +#define EXIT_REASON_EXCEPTION_NMI       0
+ +#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+ +#define EXIT_REASON_TRIPLE_FAULT        2
+ +
+ +#define EXIT_REASON_PENDING_INTERRUPT   7
+ +#define EXIT_REASON_NMI_WINDOW          8
+ +#define EXIT_REASON_TASK_SWITCH         9
+ +#define EXIT_REASON_CPUID               10
+ +#define EXIT_REASON_HLT                 12
+ +#define EXIT_REASON_INVD                13
+ +#define EXIT_REASON_INVLPG              14
+ +#define EXIT_REASON_RDPMC               15
+ +#define EXIT_REASON_RDTSC               16
+ +#define EXIT_REASON_VMCALL              18
+ +#define EXIT_REASON_VMCLEAR             19
+ +#define EXIT_REASON_VMLAUNCH            20
+ +#define EXIT_REASON_VMPTRLD             21
+ +#define EXIT_REASON_VMPTRST             22
+ +#define EXIT_REASON_VMREAD              23
+ +#define EXIT_REASON_VMRESUME            24
+ +#define EXIT_REASON_VMWRITE             25
+ +#define EXIT_REASON_VMOFF               26
+ +#define EXIT_REASON_VMON                27
+ +#define EXIT_REASON_CR_ACCESS           28
+ +#define EXIT_REASON_DR_ACCESS           29
+ +#define EXIT_REASON_IO_INSTRUCTION      30
+ +#define EXIT_REASON_MSR_READ            31
+ +#define EXIT_REASON_MSR_WRITE           32
+ +#define EXIT_REASON_INVALID_STATE       33
+ +#define EXIT_REASON_MWAIT_INSTRUCTION   36
+ +#define EXIT_REASON_MONITOR_INSTRUCTION 39
+ +#define EXIT_REASON_PAUSE_INSTRUCTION   40
+ +#define EXIT_REASON_MCE_DURING_VMENTRY  41
+ +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+ +#define EXIT_REASON_APIC_ACCESS         44
++#define EXIT_REASON_EOI_INDUCED         45
+ +#define EXIT_REASON_EPT_VIOLATION       48
+ +#define EXIT_REASON_EPT_MISCONFIG       49
+ +#define EXIT_REASON_WBINVD              54
+ +#define EXIT_REASON_XSETBV              55
++#define EXIT_REASON_APIC_WRITE          56
+ +#define EXIT_REASON_INVPCID             58
+ +
+ +#define VMX_EXIT_REASONS \
+ +      { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
+ +      { EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \
+ +      { EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \
+ +      { EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \
+ +      { EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \
+ +      { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
+ +      { EXIT_REASON_CPUID,                 "CPUID" }, \
+ +      { EXIT_REASON_HLT,                   "HLT" }, \
+ +      { EXIT_REASON_INVLPG,                "INVLPG" }, \
+ +      { EXIT_REASON_RDPMC,                 "RDPMC" }, \
+ +      { EXIT_REASON_RDTSC,                 "RDTSC" }, \
+ +      { EXIT_REASON_VMCALL,                "VMCALL" }, \
+ +      { EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \
+ +      { EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \
+ +      { EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \
+ +      { EXIT_REASON_VMPTRST,               "VMPTRST" }, \
+ +      { EXIT_REASON_VMREAD,                "VMREAD" }, \
+ +      { EXIT_REASON_VMRESUME,              "VMRESUME" }, \
+ +      { EXIT_REASON_VMWRITE,               "VMWRITE" }, \
+ +      { EXIT_REASON_VMOFF,                 "VMOFF" }, \
+ +      { EXIT_REASON_VMON,                  "VMON" }, \
+ +      { EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \
+ +      { EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \
+ +      { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
+ +      { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
+ +      { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
+ +      { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
+ +      { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
+ +      { EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \
+ +      { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
+ +      { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
+ +      { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
+ +      { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
+ +      { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
++      { EXIT_REASON_WBINVD,                "WBINVD" }, \
++      { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
++      { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
++      { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
++      { EXIT_REASON_INVD,                  "INVD" }, \
++      { EXIT_REASON_INVPCID,               "INVPCID" }
+ +
+ +
+ +#endif /* _UAPIVMX_H */
diff --combined arch/x86/kernel/kvmclock.c

index 9f966dc0b9e4aaf9fc882687dd11d8f64682e299,5bedbdddf1f2e3383f8a6fc82f81cdb2ecdcd1ad..0732f0089a3df2d0bcbde6b397fc8c3e1e76844c
--- 1/arch/x86/kernel/kvmclock.c
--- 2/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@@ -162,8 -162,8 +162,8 @@@ int kvm_register_clock(char *txt
         int low, high, ret;
         struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
   
- -      low = (int)__pa(src) | 1;
- -      high = ((u64)__pa(src) >> 32);
+ +      low = (int)slow_virt_to_phys(src) | 1;
+ +      high = ((u64)slow_virt_to_phys(src) >> 32);
         ret = native_write_msr_safe(msr_kvm_system_time, low, high);
         printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
                cpu, high, low, txt);
@@@ -218,6 -218,9 +218,9 @@@ static void kvm_shutdown(void
   void __init kvmclock_init(void)
   {
         unsigned long mem;
+       int size;
+ 
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
   
         if (!kvm_para_available())
                 return;
@@@ -231,16 -234,14 +234,14 @@@
         printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
                 msr_kvm_system_time, msr_kvm_wall_clock);
   
-       mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
-                            PAGE_SIZE);
+       mem = memblock_alloc(size, PAGE_SIZE);
         if (!mem)
                 return;
         hv_clock = __va(mem);
   
         if (kvm_register_clock("boot clock")) {
                 hv_clock = NULL;
-               memblock_free(mem,
-                       sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+               memblock_free(mem, size);
                 return;
         }
         pv_time_ops.sched_clock = kvm_clock_read;
@@@ -275,7 -276,7 +276,7 @@@ int __init kvm_setup_vsyscall_timeinfo(
         struct pvclock_vcpu_time_info *vcpu_time;
         unsigned int size;
   
-       size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+       size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
   
         preempt_disable();
         cpu = smp_processor_id();
diff --combined arch/x86/kvm/x86.c

index 37040079cd6bec44276a4161b84e120fe923293f,3c5bb6fe52804c59c31c7304757fa2aa92dc0d1b..f71500af1f813245bb12092665ac7dea3ba5f24f
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -120,7 -120,7 +120,7 @@@ struct kvm_shared_msrs 
   };
   
   static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
- -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+ +static struct kvm_shared_msrs __percpu *shared_msrs;
   
   struct kvm_stats_debugfs_item debugfs_entries[] = {
         { "pf_fixed", VCPU_STAT(pf_fixed) },
@@@ -191,10 -191,10 +191,10 @@@ static void kvm_on_user_return(struct u
   
   static void shared_msr_update(unsigned slot, u32 msr)
   {
- -      struct kvm_shared_msrs *smsr;
         u64 value;
+ +      unsigned int cpu = smp_processor_id();
+ +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
   
- -      smsr = &__get_cpu_var(shared_msrs);
         /* only read, and nobody should modify it at this time,
          * so don't need lock */
         if (slot >= shared_msrs_global.nr) {
@@@ -226,8 -226,7 +226,8 @@@ static void kvm_shared_msr_cpu_online(v
   
   void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
   {
- -      struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+ +      unsigned int cpu = smp_processor_id();
+ +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
   
         if (((value ^ smsr->values[slot].curr) & mask) == 0)
                 return;
@@@ -243,8 -242,7 +243,8 @@@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr)
   
   static void drop_user_return_notifiers(void *ignore)
   {
- -      struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+ +      unsigned int cpu = smp_processor_id();
+ +      struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
   
         if (smsr->registered)
                 kvm_on_user_return(&smsr->urn);
@@@ -872,8 -870,6 +872,6 @@@ static int set_efer(struct kvm_vcpu *vc
   
         kvm_x86_ops->set_efer(vcpu, efer);
   
-       vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
- 
         /* Update reserved bits */
         if ((efer ^ old_efer) & EFER_NX)
                 kvm_mmu_reset_context(vcpu);
@@@ -1881,14 -1877,6 +1879,14 @@@ int kvm_set_msr_common(struct kvm_vcpu 
         u64 data = msr_info->data;
   
         switch (msr) {
+ +      case MSR_AMD64_NB_CFG:
+ +      case MSR_IA32_UCODE_REV:
+ +      case MSR_IA32_UCODE_WRITE:
+ +      case MSR_VM_HSAVE_PA:
+ +      case MSR_AMD64_PATCH_LOADER:
+ +      case MSR_AMD64_BU_CFG2:
+ +              break;
+ +
         case MSR_EFER:
                 return set_efer(vcpu, data);
         case MSR_K7_HWCR:
@@@ -1908,6 -1896,8 +1906,6 @@@
                         return 1;
                 }
                 break;
- -      case MSR_AMD64_NB_CFG:
- -              break;
         case MSR_IA32_DEBUGCTLMSR:
                 if (!data) {
                         /* We support the non-activated case already */
@@@ -1920,6 -1910,11 +1918,6 @@@
                 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
                             __func__, data);
                 break;
- -      case MSR_IA32_UCODE_REV:
- -      case MSR_IA32_UCODE_WRITE:
- -      case MSR_VM_HSAVE_PA:
- -      case MSR_AMD64_PATCH_LOADER:
- -              break;
         case 0x200 ... 0x2ff:
                 return set_msr_mtrr(vcpu, msr, data);
         case MSR_IA32_APICBASE:
@@@ -2254,7 -2249,6 +2252,7 @@@ int kvm_get_msr_common(struct kvm_vcpu 
         case MSR_K8_INT_PENDING_MSG:
         case MSR_AMD64_NB_CFG:
         case MSR_FAM10H_MMIO_CONF_BASE:
+ +      case MSR_AMD64_BU_CFG2:
                 data = 0;
                 break;
         case MSR_P6_PERFCTR0:
@@@ -2522,7 -2516,7 +2520,7 @@@ int kvm_dev_ioctl_check_extension(long 
                 r = KVM_MAX_VCPUS;
                 break;
         case KVM_CAP_NR_MEMSLOTS:
-               r = KVM_MEMORY_SLOTS;
+               r = KVM_USER_MEM_SLOTS;
                 break;
         case KVM_CAP_PV_MMU:    /* obsolete */
                 r = 0;
@@@ -3274,12 -3268,10 +3272,10 @@@ static int kvm_vm_ioctl_set_nr_mmu_page
                 return -EINVAL;
   
         mutex_lock(&kvm->slots_lock);
-       spin_lock(&kvm->mmu_lock);
   
         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
   
-       spin_unlock(&kvm->mmu_lock);
         mutex_unlock(&kvm->slots_lock);
         return 0;
   }
@@@ -3439,7 -3431,7 +3435,7 @@@ int kvm_vm_ioctl_get_dirty_log(struct k
         mutex_lock(&kvm->slots_lock);
   
         r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
+       if (log->slot >= KVM_USER_MEM_SLOTS)
                 goto out;
   
         memslot = id_to_memslot(kvm->memslots, log->slot);
@@@ -4495,8 -4487,10 +4491,10 @@@ static bool emulator_get_segment(struc
         kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
         *selector = var.selector;
   
-       if (var.unusable)
+       if (var.unusable) {
+               memset(desc, 0, sizeof(*desc));
                 return false;
+       }
   
         if (var.g)
                 var.limit >>= 12;
@@@ -4757,26 -4751,26 +4755,26 @@@ static int handle_emulation_failure(str
         return r;
   }
   
- static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+                                 bool write_fault_to_shadow_pgtable)
   {
-       gpa_t gpa;
+       gpa_t gpa = cr2;
         pfn_t pfn;
   
-       if (tdp_enabled)
-               return false;
- 
-       /*
-        * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-enter the
-        * guest to let CPU execute the instruction.
-        */
-       if (kvm_mmu_unprotect_page_virt(vcpu, gva))
-               return true;
- 
-       gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+       if (!vcpu->arch.mmu.direct_map) {
+               /*
+                * Write permission should be allowed since only
+                * write access need to be emulated.
+                */
+               gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
   
-       if (gpa == UNMAPPED_GVA)
-               return true; /* let cpu generate fault */
+               /*
+                * If the mapping is invalid in guest, let cpu retry
+                * it to generate fault.
+                */
+               if (gpa == UNMAPPED_GVA)
+                       return true;
+       }
   
         /*
          * Do not retry the unhandleable instruction if it faults on the
@@@ -4785,12 -4779,43 +4783,43 @@@
          * instruction -> ...
          */
         pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-       if (!is_error_noslot_pfn(pfn)) {
-               kvm_release_pfn_clean(pfn);
+ 
+       /*
+        * If the instruction failed on the error pfn, it can not be fixed,
+        * report the error to userspace.
+        */
+       if (is_error_noslot_pfn(pfn))
+               return false;
+ 
+       kvm_release_pfn_clean(pfn);
+ 
+       /* The instructions are well-emulated on direct mmu. */
+       if (vcpu->arch.mmu.direct_map) {
+               unsigned int indirect_shadow_pages;
+ 
+               spin_lock(&vcpu->kvm->mmu_lock);
+               indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+               spin_unlock(&vcpu->kvm->mmu_lock);
+ 
+               if (indirect_shadow_pages)
+                       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ 
                 return true;
         }
   
-       return false;
+       /*
+        * if emulation was due to access to shadowed page table
+        * and it failed try to unshadow page and re-enter the
+        * guest to let CPU execute the instruction.
+        */
+       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ 
+       /*
+        * If the access faults on its page table, it can not
+        * be fixed by unprotecting shadow page and it should
+        * be reported to userspace.
+        */
+       return !write_fault_to_shadow_pgtable;
   }
   
   static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@@ -4832,7 -4857,7 +4861,7 @@@
         if (!vcpu->arch.mmu.direct_map)
                 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
   
-       kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
   
         return true;
   }
@@@ -4849,7 -4874,13 +4878,13 @@@ int x86_emulate_instruction(struct kvm_
         int r;
         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
         bool writeback = true;
+       bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
   
+       /*
+        * Clear write_fault_to_shadow_pgtable here to ensure it is
+        * never reused.
+        */
+       vcpu->arch.write_fault_to_shadow_pgtable = false;
         kvm_clear_exception_queue(vcpu);
   
         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@@ -4868,7 -4899,8 +4903,8 @@@
                 if (r != EMULATION_OK)  {
                         if (emulation_type & EMULTYPE_TRAP_UD)
                                 return EMULATE_FAIL;
-                       if (reexecute_instruction(vcpu, cr2))
+                       if (reexecute_instruction(vcpu, cr2,
+                                                 write_fault_to_spt))
                                 return EMULATE_DONE;
                         if (emulation_type & EMULTYPE_SKIP)
                                 return EMULATE_FAIL;
@@@ -4898,7 -4930,7 +4934,7 @@@ restart
                 return EMULATE_DONE;
   
         if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2))
+               if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
                         return EMULATE_DONE;
   
                 return handle_emulation_failure(vcpu);
@@@ -5237,16 -5269,9 +5273,16 @@@ int kvm_arch_init(void *opaque
                 goto out;
         }
   
+ +      r = -ENOMEM;
+ +      shared_msrs = alloc_percpu(struct kvm_shared_msrs);
+ +      if (!shared_msrs) {
+ +              printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ +              goto out;
+ +      }
+ +
         r = kvm_mmu_module_init();
         if (r)
- -              goto out;
+ +              goto out_free_percpu;
   
         kvm_set_mmio_spte_mask();
         kvm_init_msr_list();
@@@ -5269,8 -5294,6 +5305,8 @@@
   
         return 0;
   
+ +out_free_percpu:
+ +      free_percpu(shared_msrs);
   out:
         return r;
   }
@@@ -5288,7 -5311,6 +5324,7 @@@ void kvm_arch_exit(void
   #endif
         kvm_x86_ops = NULL;
         kvm_mmu_module_exit();
+ +      free_percpu(shared_msrs);
   }
   
   int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@@ -5541,7 -5563,7 +5577,7 @@@ static void inject_pending_event(struc
                         vcpu->arch.nmi_injected = true;
                         kvm_x86_ops->set_nmi(vcpu);
                 }
-       } else if (kvm_cpu_has_interrupt(vcpu)) {
+       } else if (kvm_cpu_has_injectable_intr(vcpu)) {
                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
                                             false);
@@@ -5609,6 -5631,16 +5645,16 @@@ static void kvm_gen_update_masterclock(
   #endif
   }
   
+ static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+ {
+       u64 eoi_exit_bitmap[4];
+ 
+       memset(eoi_exit_bitmap, 0, 32);
+ 
+       kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+       kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ }
+ 
   static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
   {
         int r;
@@@ -5662,6 -5694,8 +5708,8 @@@
                         kvm_handle_pmu_event(vcpu);
                 if (kvm_check_request(KVM_REQ_PMI, vcpu))
                         kvm_deliver_pmi(vcpu);
+               if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+                       update_eoi_exitmap(vcpu);
         }
   
         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@@ -5670,10 -5704,17 +5718,17 @@@
                 /* enable NMI/IRQ window open exits if needed */
                 if (vcpu->arch.nmi_pending)
                         kvm_x86_ops->enable_nmi_window(vcpu);
-               else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+               else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
                         kvm_x86_ops->enable_irq_window(vcpu);
   
                 if (kvm_lapic_enabled(vcpu)) {
+                       /*
+                        * Update architecture specific hints for APIC
+                        * virtual interrupt delivery.
+                        */
+                       if (kvm_x86_ops->hwapic_irr_update)
+                               kvm_x86_ops->hwapic_irr_update(vcpu,
+                                       kvm_lapic_find_highest_irr(vcpu));
                         update_cr8_intercept(vcpu);
                         kvm_lapic_sync_to_vapic(vcpu);
                 }
@@@ -6853,48 -6894,43 +6908,43 @@@ int kvm_arch_prepare_memory_region(stru
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_memory_slot old,
                                 struct kvm_userspace_memory_region *mem,
-                               int user_alloc)
+                               bool user_alloc)
   {
         int npages = memslot->npages;
-       int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
   
-       /* Prevent internal slot pages from being moved by fork()/COW. */
-       if (memslot->id >= KVM_MEMORY_SLOTS)
-               map_flags = MAP_SHARED | MAP_ANONYMOUS;
- 
-       /*To keep backward compatibility with older userspace,
-        *x86 needs to handle !user_alloc case.
+       /*
+        * Only private memory slots need to be mapped here since
+        * KVM_SET_MEMORY_REGION ioctl is no longer supported.
          */
-       if (!user_alloc) {
-               if (npages && !old.npages) {
-                       unsigned long userspace_addr;
+       if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+               unsigned long userspace_addr;
   
-                       userspace_addr = vm_mmap(NULL, 0,
-                                                npages * PAGE_SIZE,
-                                                PROT_READ | PROT_WRITE,
-                                                map_flags,
-                                                0);
+               /*
+                * MAP_SHARED to prevent internal slot pages from being moved
+                * by fork()/COW.
+                */
+               userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+                                        PROT_READ | PROT_WRITE,
+                                        MAP_SHARED | MAP_ANONYMOUS, 0);
   
-                       if (IS_ERR((void *)userspace_addr))
-                               return PTR_ERR((void *)userspace_addr);
+               if (IS_ERR((void *)userspace_addr))
+                       return PTR_ERR((void *)userspace_addr);
   
-                       memslot->userspace_addr = userspace_addr;
-               }
+               memslot->userspace_addr = userspace_addr;
         }
   
- 
         return 0;
   }
   
   void kvm_arch_commit_memory_region(struct kvm *kvm,
                                 struct kvm_userspace_memory_region *mem,
                                 struct kvm_memory_slot old,
-                               int user_alloc)
+                               bool user_alloc)
   {
   
         int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
   
-       if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+       if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
                 int ret;
   
                 ret = vm_munmap(old.userspace_addr,
@@@ -6908,11 -6944,15 +6958,15 @@@
         if (!kvm->arch.n_requested_mmu_pages)
                 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
   
-       spin_lock(&kvm->mmu_lock);
         if (nr_mmu_pages)
                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
-       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       spin_unlock(&kvm->mmu_lock);
+       /*
+        * Write protect all pages for dirty logging.
+        * Existing largepage mappings are destroyed here and new ones will
+        * not be created until the end of the logging.
+        */
+       if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+               kvm_mmu_slot_remove_write_access(kvm, mem->slot);
         /*
          * If memory slot is created, or moved, we need to clear all
          * mmio sptes.
diff --combined drivers/s390/kvm/kvm_virtio.c

index 8491111aec12d45b780243a907f71bb86adbfe17,b846b6c4130ab6b292c672cf7915e9002752ed39..03a15e016778c60b246b40ae173fd6dd9866935f
--- 1/drivers/s390/kvm/kvm_virtio.c
--- 2/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@@ -392,7 -392,7 +392,7 @@@ static void kvm_extint_handler(struct e
   
         if ((ext_code.subcode & 0xff00) != VIRTIO_SUBCODE_64)
                 return;
- -      kstat_cpu(smp_processor_id()).irqs[EXTINT_VRT]++;
+ +      inc_irq_stat(IRQEXT_VRT);
   
         /* The LSB might be overloaded, we have to mask it */
         vq = (struct virtqueue *)(param64 & ~1UL);
@@@ -421,6 -421,26 +421,26 @@@
         }
   }
   
+ /*
+  * For s390-virtio, we expect a page above main storage containing
+  * the virtio configuration. Try to actually load from this area
+  * in order to figure out if the host provides this page.
+  */
+ static int __init test_devices_support(unsigned long addr)
+ {
+       int ret = -EIO;
+ 
+       asm volatile(
+               "0:     lura    0,%1\n"
+               "1:     xgr     %0,%0\n"
+               "2:\n"
+               EX_TABLE(0b,2b)
+               EX_TABLE(1b,2b)
+               : "+d" (ret)
+               : "a" (addr)
+               : "0", "cc");
+       return ret;
+ }
   /*
    * Init function for virtio
    * devices are in a single page above top of "normal" mem
@@@ -432,21 -452,23 +452,23 @@@ static int __init kvm_devices_init(void
         if (!MACHINE_IS_KVM)
                 return -ENODEV;
   
+       if (test_devices_support(real_memory_size) < 0)
+               return -ENODEV;
+ 
+       rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
+       if (rc)
+               return rc;
+ 
+       kvm_devices = (void *) real_memory_size;
+ 
         kvm_root = root_device_register("kvm_s390");
         if (IS_ERR(kvm_root)) {
                 rc = PTR_ERR(kvm_root);
                 printk(KERN_ERR "Could not register kvm_s390 root device");
+               vmem_remove_mapping(real_memory_size, PAGE_SIZE);
                 return rc;
         }
   
-       rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
-       if (rc) {
-               root_device_unregister(kvm_root);
-               return rc;
-       }
- 
-       kvm_devices = (void *) real_memory_size;
- 
         INIT_WORK(&hotplug_work, hotplug_devices);
   
         service_subclass_irq_register();
diff --combined drivers/s390/kvm/virtio_ccw.c

index 0000000000000000000000000000000000000000,3217dfe5cb8b24d71029d5459da6c5886a6ba6ef..2029b6caa5956c2b4a01044b83325a0de16e3104

mode 000000,100644..100644
--- /dev/null
--- 2/drivers/s390/kvm/virtio_ccw.c
+++ b/drivers/s390/kvm/virtio_ccw.c
@@@ -1,0 -1,926 +1,926 @@@
- -      .int_class = IOINT_VIR,
+ /*
+  * ccw based virtio transport
+  *
+  * Copyright IBM Corp. 2012
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License (version 2 only)
+  * as published by the Free Software Foundation.
+  *
+  *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+  */
+ 
+ #include <linux/kernel_stat.h>
+ #include <linux/init.h>
+ #include <linux/bootmem.h>
+ #include <linux/err.h>
+ #include <linux/virtio.h>
+ #include <linux/virtio_config.h>
+ #include <linux/slab.h>
+ #include <linux/interrupt.h>
+ #include <linux/virtio_ring.h>
+ #include <linux/pfn.h>
+ #include <linux/async.h>
+ #include <linux/wait.h>
+ #include <linux/list.h>
+ #include <linux/bitops.h>
+ #include <linux/module.h>
+ #include <linux/io.h>
+ #include <linux/kvm_para.h>
+ #include <asm/setup.h>
+ #include <asm/irq.h>
+ #include <asm/cio.h>
+ #include <asm/ccwdev.h>
+ 
+ /*
+  * virtio related functions
+  */
+ 
+ struct vq_config_block {
+       __u16 index;
+       __u16 num;
+ } __packed;
+ 
+ #define VIRTIO_CCW_CONFIG_SIZE 0x100
+ /* same as PCI config space size, should be enough for all drivers */
+ 
+ struct virtio_ccw_device {
+       struct virtio_device vdev;
+       __u8 *status;
+       __u8 config[VIRTIO_CCW_CONFIG_SIZE];
+       struct ccw_device *cdev;
+       __u32 curr_io;
+       int err;
+       wait_queue_head_t wait_q;
+       spinlock_t lock;
+       struct list_head virtqueues;
+       unsigned long indicators;
+       unsigned long indicators2;
+       struct vq_config_block *config_block;
+ };
+ 
+ struct vq_info_block {
+       __u64 queue;
+       __u32 align;
+       __u16 index;
+       __u16 num;
+ } __packed;
+ 
+ struct virtio_feature_desc {
+       __u32 features;
+       __u8 index;
+ } __packed;
+ 
+ struct virtio_ccw_vq_info {
+       struct virtqueue *vq;
+       int num;
+       void *queue;
+       struct vq_info_block *info_block;
+       struct list_head node;
+ };
+ 
+ #define KVM_VIRTIO_CCW_RING_ALIGN 4096
+ 
+ #define KVM_S390_VIRTIO_CCW_NOTIFY 3
+ 
+ #define CCW_CMD_SET_VQ 0x13
+ #define CCW_CMD_VDEV_RESET 0x33
+ #define CCW_CMD_SET_IND 0x43
+ #define CCW_CMD_SET_CONF_IND 0x53
+ #define CCW_CMD_READ_FEAT 0x12
+ #define CCW_CMD_WRITE_FEAT 0x11
+ #define CCW_CMD_READ_CONF 0x22
+ #define CCW_CMD_WRITE_CONF 0x21
+ #define CCW_CMD_WRITE_STATUS 0x31
+ #define CCW_CMD_READ_VQ_CONF 0x32
+ 
+ #define VIRTIO_CCW_DOING_SET_VQ 0x00010000
+ #define VIRTIO_CCW_DOING_RESET 0x00040000
+ #define VIRTIO_CCW_DOING_READ_FEAT 0x00080000
+ #define VIRTIO_CCW_DOING_WRITE_FEAT 0x00100000
+ #define VIRTIO_CCW_DOING_READ_CONFIG 0x00200000
+ #define VIRTIO_CCW_DOING_WRITE_CONFIG 0x00400000
+ #define VIRTIO_CCW_DOING_WRITE_STATUS 0x00800000
+ #define VIRTIO_CCW_DOING_SET_IND 0x01000000
+ #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
+ #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
+ #define VIRTIO_CCW_INTPARM_MASK 0xffff0000
+ 
+ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
+ {
+       return container_of(vdev, struct virtio_ccw_device, vdev);
+ }
+ 
+ static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
+ {
+       unsigned long flags;
+       __u32 ret;
+ 
+       spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
+       if (vcdev->err)
+               ret = 0;
+       else
+               ret = vcdev->curr_io & flag;
+       spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
+       return ret;
+ }
+ 
+ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
+                        struct ccw1 *ccw, __u32 intparm)
+ {
+       int ret;
+       unsigned long flags;
+       int flag = intparm & VIRTIO_CCW_INTPARM_MASK;
+ 
+       do {
+               spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
+               ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0);
+               if (!ret)
+                       vcdev->curr_io |= flag;
+               spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
+               cpu_relax();
+       } while (ret == -EBUSY);
+       wait_event(vcdev->wait_q, doing_io(vcdev, flag) == 0);
+       return ret ? ret : vcdev->err;
+ }
+ 
+ static inline long do_kvm_notify(struct subchannel_id schid,
+                                unsigned long queue_index)
+ {
+       register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY;
+       register struct subchannel_id __schid asm("2") = schid;
+       register unsigned long __index asm("3") = queue_index;
+       register long __rc asm("2");
+ 
+       asm volatile ("diag 2,4,0x500\n"
+                     : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index)
+                     : "memory", "cc");
+       return __rc;
+ }
+ 
+ static void virtio_ccw_kvm_notify(struct virtqueue *vq)
+ {
+       struct virtio_ccw_vq_info *info = vq->priv;
+       struct virtio_ccw_device *vcdev;
+       struct subchannel_id schid;
+ 
+       vcdev = to_vc_device(info->vq->vdev);
+       ccw_device_get_schid(vcdev->cdev, &schid);
+       do_kvm_notify(schid, virtqueue_get_queue_index(vq));
+ }
+ 
+ static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
+                                  struct ccw1 *ccw, int index)
+ {
+       vcdev->config_block->index = index;
+       ccw->cmd_code = CCW_CMD_READ_VQ_CONF;
+       ccw->flags = 0;
+       ccw->count = sizeof(struct vq_config_block);
+       ccw->cda = (__u32)(unsigned long)(vcdev->config_block);
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_VQ_CONF);
+       return vcdev->config_block->num;
+ }
+ 
+ static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vq->vdev);
+       struct virtio_ccw_vq_info *info = vq->priv;
+       unsigned long flags;
+       unsigned long size;
+       int ret;
+       unsigned int index = virtqueue_get_queue_index(vq);
+ 
+       /* Remove from our list. */
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_del(&info->node);
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+ 
+       /* Release from host. */
+       info->info_block->queue = 0;
+       info->info_block->align = 0;
+       info->info_block->index = index;
+       info->info_block->num = 0;
+       ccw->cmd_code = CCW_CMD_SET_VQ;
+       ccw->flags = 0;
+       ccw->count = sizeof(*info->info_block);
+       ccw->cda = (__u32)(unsigned long)(info->info_block);
+       ret = ccw_io_helper(vcdev, ccw,
+                           VIRTIO_CCW_DOING_SET_VQ | index);
+       /*
+        * -ENODEV isn't considered an error: The device is gone anyway.
+        * This may happen on device detach.
+        */
+       if (ret && (ret != -ENODEV))
+               dev_warn(&vq->vdev->dev, "Error %d while deleting queue %d",
+                        ret, index);
+ 
+       vring_del_virtqueue(vq);
+       size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
+       free_pages_exact(info->queue, size);
+       kfree(info->info_block);
+       kfree(info);
+ }
+ 
+ static void virtio_ccw_del_vqs(struct virtio_device *vdev)
+ {
+       struct virtqueue *vq, *n;
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+ 
+       list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+               virtio_ccw_del_vq(vq, ccw);
+ 
+       kfree(ccw);
+ }
+ 
+ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
+                                            int i, vq_callback_t *callback,
+                                            const char *name,
+                                            struct ccw1 *ccw)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       int err;
+       struct virtqueue *vq = NULL;
+       struct virtio_ccw_vq_info *info;
+       unsigned long size = 0; /* silence the compiler */
+       unsigned long flags;
+ 
+       /* Allocate queue. */
+       info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL);
+       if (!info) {
+               dev_warn(&vcdev->cdev->dev, "no info\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       info->info_block = kzalloc(sizeof(*info->info_block),
+                                  GFP_DMA | GFP_KERNEL);
+       if (!info->info_block) {
+               dev_warn(&vcdev->cdev->dev, "no info block\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+       info->num = virtio_ccw_read_vq_conf(vcdev, ccw, i);
+       size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
+       info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
+       if (info->queue == NULL) {
+               dev_warn(&vcdev->cdev->dev, "no queue\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+ 
+       vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev,
+                                true, info->queue, virtio_ccw_kvm_notify,
+                                callback, name);
+       if (!vq) {
+               /* For now, we fail if we can't get the requested size. */
+               dev_warn(&vcdev->cdev->dev, "no vq\n");
+               err = -ENOMEM;
+               goto out_err;
+       }
+ 
+       /* Register it with the host. */
+       info->info_block->queue = (__u64)info->queue;
+       info->info_block->align = KVM_VIRTIO_CCW_RING_ALIGN;
+       info->info_block->index = i;
+       info->info_block->num = info->num;
+       ccw->cmd_code = CCW_CMD_SET_VQ;
+       ccw->flags = 0;
+       ccw->count = sizeof(*info->info_block);
+       ccw->cda = (__u32)(unsigned long)(info->info_block);
+       err = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_VQ | i);
+       if (err) {
+               dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n");
+               goto out_err;
+       }
+ 
+       info->vq = vq;
+       vq->priv = info;
+ 
+       /* Save it to our list. */
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_add(&info->node, &vcdev->virtqueues);
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+ 
+       return vq;
+ 
+ out_err:
+       if (vq)
+               vring_del_virtqueue(vq);
+       if (info) {
+               if (info->queue)
+                       free_pages_exact(info->queue, size);
+               kfree(info->info_block);
+       }
+       kfree(info);
+       return ERR_PTR(err);
+ }
+ 
+ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+                              struct virtqueue *vqs[],
+                              vq_callback_t *callbacks[],
+                              const char *names[])
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       unsigned long *indicatorp = NULL;
+       int ret, i;
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return -ENOMEM;
+ 
+       for (i = 0; i < nvqs; ++i) {
+               vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i],
+                                            ccw);
+               if (IS_ERR(vqs[i])) {
+                       ret = PTR_ERR(vqs[i]);
+                       vqs[i] = NULL;
+                       goto out;
+               }
+       }
+       ret = -ENOMEM;
+       /* We need a data area under 2G to communicate. */
+       indicatorp = kmalloc(sizeof(&vcdev->indicators), GFP_DMA | GFP_KERNEL);
+       if (!indicatorp)
+               goto out;
+       *indicatorp = (unsigned long) &vcdev->indicators;
+       /* Register queue indicators with host. */
+       vcdev->indicators = 0;
+       ccw->cmd_code = CCW_CMD_SET_IND;
+       ccw->flags = 0;
+       ccw->count = sizeof(vcdev->indicators);
+       ccw->cda = (__u32)(unsigned long) indicatorp;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
+       if (ret)
+               goto out;
+       /* Register indicators2 with host for config changes */
+       *indicatorp = (unsigned long) &vcdev->indicators2;
+       vcdev->indicators2 = 0;
+       ccw->cmd_code = CCW_CMD_SET_CONF_IND;
+       ccw->flags = 0;
+       ccw->count = sizeof(vcdev->indicators2);
+       ccw->cda = (__u32)(unsigned long) indicatorp;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_CONF_IND);
+       if (ret)
+               goto out;
+ 
+       kfree(indicatorp);
+       kfree(ccw);
+       return 0;
+ out:
+       kfree(indicatorp);
+       kfree(ccw);
+       virtio_ccw_del_vqs(vdev);
+       return ret;
+ }
+ 
+ static void virtio_ccw_reset(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+       /* Zero status bits. */
+       *vcdev->status = 0;
+ 
+       /* Send a reset ccw on device. */
+       ccw->cmd_code = CCW_CMD_VDEV_RESET;
+       ccw->flags = 0;
+       ccw->count = 0;
+       ccw->cda = 0;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_RESET);
+       kfree(ccw);
+ }
+ 
+ static u32 virtio_ccw_get_features(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct virtio_feature_desc *features;
+       int ret, rc;
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return 0;
+ 
+       features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
+       if (!features) {
+               rc = 0;
+               goto out_free;
+       }
+       /* Read the feature bits from the host. */
+       /* TODO: Features > 32 bits */
+       features->index = 0;
+       ccw->cmd_code = CCW_CMD_READ_FEAT;
+       ccw->flags = 0;
+       ccw->count = sizeof(*features);
+       ccw->cda = (__u32)(unsigned long)features;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_FEAT);
+       if (ret) {
+               rc = 0;
+               goto out_free;
+       }
+ 
+       rc = le32_to_cpu(features->features);
+ 
+ out_free:
+       kfree(features);
+       kfree(ccw);
+       return rc;
+ }
+ 
+ static void virtio_ccw_finalize_features(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct virtio_feature_desc *features;
+       int i;
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+       features = kzalloc(sizeof(*features), GFP_DMA | GFP_KERNEL);
+       if (!features)
+               goto out_free;
+ 
+       /* Give virtio_ring a chance to accept features. */
+       vring_transport_features(vdev);
+ 
+       for (i = 0; i < sizeof(*vdev->features) / sizeof(features->features);
+            i++) {
+               int highbits = i % 2 ? 32 : 0;
+               features->index = i;
+               features->features = cpu_to_le32(vdev->features[i / 2]
+                                                >> highbits);
+               /* Write the feature bits to the host. */
+               ccw->cmd_code = CCW_CMD_WRITE_FEAT;
+               ccw->flags = 0;
+               ccw->count = sizeof(*features);
+               ccw->cda = (__u32)(unsigned long)features;
+               ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_FEAT);
+       }
+ out_free:
+       kfree(features);
+       kfree(ccw);
+ }
+ 
+ static void virtio_ccw_get_config(struct virtio_device *vdev,
+                                 unsigned int offset, void *buf, unsigned len)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       int ret;
+       struct ccw1 *ccw;
+       void *config_area;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+       config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
+       if (!config_area)
+               goto out_free;
+ 
+       /* Read the config area from the host. */
+       ccw->cmd_code = CCW_CMD_READ_CONF;
+       ccw->flags = 0;
+       ccw->count = offset + len;
+       ccw->cda = (__u32)(unsigned long)config_area;
+       ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_READ_CONFIG);
+       if (ret)
+               goto out_free;
+ 
+       memcpy(vcdev->config, config_area, sizeof(vcdev->config));
+       memcpy(buf, &vcdev->config[offset], len);
+ 
+ out_free:
+       kfree(config_area);
+       kfree(ccw);
+ }
+ 
+ static void virtio_ccw_set_config(struct virtio_device *vdev,
+                                 unsigned int offset, const void *buf,
+                                 unsigned len)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+       void *config_area;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+       config_area = kzalloc(VIRTIO_CCW_CONFIG_SIZE, GFP_DMA | GFP_KERNEL);
+       if (!config_area)
+               goto out_free;
+ 
+       memcpy(&vcdev->config[offset], buf, len);
+       /* Write the config area to the host. */
+       memcpy(config_area, vcdev->config, sizeof(vcdev->config));
+       ccw->cmd_code = CCW_CMD_WRITE_CONF;
+       ccw->flags = 0;
+       ccw->count = offset + len;
+       ccw->cda = (__u32)(unsigned long)config_area;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_CONFIG);
+ 
+ out_free:
+       kfree(config_area);
+       kfree(ccw);
+ }
+ 
+ static u8 virtio_ccw_get_status(struct virtio_device *vdev)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+ 
+       return *vcdev->status;
+ }
+ 
+ static void virtio_ccw_set_status(struct virtio_device *vdev, u8 status)
+ {
+       struct virtio_ccw_device *vcdev = to_vc_device(vdev);
+       struct ccw1 *ccw;
+ 
+       ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
+       if (!ccw)
+               return;
+ 
+       /* Write the status to the host. */
+       *vcdev->status = status;
+       ccw->cmd_code = CCW_CMD_WRITE_STATUS;
+       ccw->flags = 0;
+       ccw->count = sizeof(status);
+       ccw->cda = (__u32)(unsigned long)vcdev->status;
+       ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_WRITE_STATUS);
+       kfree(ccw);
+ }
+ 
+ static struct virtio_config_ops virtio_ccw_config_ops = {
+       .get_features = virtio_ccw_get_features,
+       .finalize_features = virtio_ccw_finalize_features,
+       .get = virtio_ccw_get_config,
+       .set = virtio_ccw_set_config,
+       .get_status = virtio_ccw_get_status,
+       .set_status = virtio_ccw_set_status,
+       .reset = virtio_ccw_reset,
+       .find_vqs = virtio_ccw_find_vqs,
+       .del_vqs = virtio_ccw_del_vqs,
+ };
+ 
+ 
+ /*
+  * ccw bus driver related functions
+  */
+ 
+ static void virtio_ccw_release_dev(struct device *_d)
+ {
+       struct virtio_device *dev = container_of(_d, struct virtio_device,
+                                                dev);
+       struct virtio_ccw_device *vcdev = to_vc_device(dev);
+ 
+       kfree(vcdev->status);
+       kfree(vcdev->config_block);
+       kfree(vcdev);
+ }
+ 
+ static int irb_is_error(struct irb *irb)
+ {
+       if (scsw_cstat(&irb->scsw) != 0)
+               return 1;
+       if (scsw_dstat(&irb->scsw) & ~(DEV_STAT_CHN_END | DEV_STAT_DEV_END))
+               return 1;
+       if (scsw_cc(&irb->scsw) != 0)
+               return 1;
+       return 0;
+ }
+ 
+ static struct virtqueue *virtio_ccw_vq_by_ind(struct virtio_ccw_device *vcdev,
+                                             int index)
+ {
+       struct virtio_ccw_vq_info *info;
+       unsigned long flags;
+       struct virtqueue *vq;
+ 
+       vq = NULL;
+       spin_lock_irqsave(&vcdev->lock, flags);
+       list_for_each_entry(info, &vcdev->virtqueues, node) {
+               if (virtqueue_get_queue_index(info->vq) == index) {
+                       vq = info->vq;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&vcdev->lock, flags);
+       return vq;
+ }
+ 
+ static void virtio_ccw_int_handler(struct ccw_device *cdev,
+                                  unsigned long intparm,
+                                  struct irb *irb)
+ {
+       __u32 activity = intparm & VIRTIO_CCW_INTPARM_MASK;
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+       int i;
+       struct virtqueue *vq;
+       struct virtio_driver *drv;
+ 
+       /* Check if it's a notification from the host. */
+       if ((intparm == 0) &&
+           (scsw_stctl(&irb->scsw) ==
+            (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND))) {
+               /* OK */
+       }
+       if (irb_is_error(irb))
+               vcdev->err = -EIO; /* XXX - use real error */
+       if (vcdev->curr_io & activity) {
+               switch (activity) {
+               case VIRTIO_CCW_DOING_READ_FEAT:
+               case VIRTIO_CCW_DOING_WRITE_FEAT:
+               case VIRTIO_CCW_DOING_READ_CONFIG:
+               case VIRTIO_CCW_DOING_WRITE_CONFIG:
+               case VIRTIO_CCW_DOING_WRITE_STATUS:
+               case VIRTIO_CCW_DOING_SET_VQ:
+               case VIRTIO_CCW_DOING_SET_IND:
+               case VIRTIO_CCW_DOING_SET_CONF_IND:
+               case VIRTIO_CCW_DOING_RESET:
+               case VIRTIO_CCW_DOING_READ_VQ_CONF:
+                       vcdev->curr_io &= ~activity;
+                       wake_up(&vcdev->wait_q);
+                       break;
+               default:
+                       /* don't know what to do... */
+                       dev_warn(&cdev->dev, "Suspicious activity '%08x'\n",
+                                activity);
+                       WARN_ON(1);
+                       break;
+               }
+       }
+       for_each_set_bit(i, &vcdev->indicators,
+                        sizeof(vcdev->indicators) * BITS_PER_BYTE) {
+               /* The bit clear must happen before the vring kick. */
+               clear_bit(i, &vcdev->indicators);
+               barrier();
+               vq = virtio_ccw_vq_by_ind(vcdev, i);
+               vring_interrupt(0, vq);
+       }
+       if (test_bit(0, &vcdev->indicators2)) {
+               drv = container_of(vcdev->vdev.dev.driver,
+                                  struct virtio_driver, driver);
+ 
+               if (drv && drv->config_changed)
+                       drv->config_changed(&vcdev->vdev);
+               clear_bit(0, &vcdev->indicators2);
+       }
+ }
+ 
+ /*
+  * We usually want to autoonline all devices, but give the admin
+  * a way to exempt devices from this.
+  */
+ #define __DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
+                    (8*sizeof(long)))
+ static unsigned long devs_no_auto[__MAX_SSID + 1][__DEV_WORDS];
+ 
+ static char *no_auto = "";
+ 
+ module_param(no_auto, charp, 0444);
+ MODULE_PARM_DESC(no_auto, "list of ccw bus id ranges not to be auto-onlined");
+ 
+ static int virtio_ccw_check_autoonline(struct ccw_device *cdev)
+ {
+       struct ccw_dev_id id;
+ 
+       ccw_device_get_id(cdev, &id);
+       if (test_bit(id.devno, devs_no_auto[id.ssid]))
+               return 0;
+       return 1;
+ }
+ 
+ static void virtio_ccw_auto_online(void *data, async_cookie_t cookie)
+ {
+       struct ccw_device *cdev = data;
+       int ret;
+ 
+       ret = ccw_device_set_online(cdev);
+       if (ret)
+               dev_warn(&cdev->dev, "Failed to set online: %d\n", ret);
+ }
+ 
+ /*
+  * Probe callback: install the interrupt handler and, unless the device was
+  * exempted via devs_no_auto, schedule it to be set online asynchronously.
+  * Always returns 0.
+  */
+ static int virtio_ccw_probe(struct ccw_device *cdev)
+ {
+       cdev->handler = virtio_ccw_int_handler;
+ 
+       if (!virtio_ccw_check_autoonline(cdev))
+               return 0;
+ 
+       async_schedule(virtio_ccw_auto_online, cdev);
+       return 0;
+ }
+ 
+ /*
+  * Remove callback: if the device is still online, unregister its virtio
+  * device and drop the driver data pointer; in every case detach the
+  * interrupt handler.
+  */
+ static void virtio_ccw_remove(struct ccw_device *cdev)
+ {
+       struct device *dev = &cdev->dev;
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(dev);
+ 
+       if (cdev->online) {
+               unregister_virtio_device(&vcdev->vdev);
+               dev_set_drvdata(dev, NULL);
+       }
+       cdev->handler = NULL;
+ }
+ 
+ /*
+  * set_offline callback: unregister the virtio device attached to @cdev and
+  * clear the driver data pointer.  Always returns 0.
+  */
+ static int virtio_ccw_offline(struct ccw_device *cdev)
+ {
+       struct device *dev = &cdev->dev;
+       struct virtio_ccw_device *vcdev = dev_get_drvdata(dev);
+ 
+       unregister_virtio_device(&vcdev->vdev);
+       dev_set_drvdata(dev, NULL);
+       return 0;
+ }
+ 
+ 
+ /*
+  * set_online callback: allocate the per-device state, tie it to @cdev and
+  * register the embedded virtio device.
+  *
+  * Returns 0 on success, -ENOMEM if an allocation fails, or the error code
+  * returned by register_virtio_device().
+  */
+ static int virtio_ccw_online(struct ccw_device *cdev)
+ {
+       struct virtio_ccw_device *vcdev;
+       int ret;
+ 
+       vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
+       if (!vcdev) {
+               dev_warn(&cdev->dev, "Could not get memory for virtio\n");
+               return -ENOMEM;
+       }
+ 
+       /* Both helper buffers are requested with GFP_DMA. */
+       vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
+                                  GFP_DMA | GFP_KERNEL);
+       if (!vcdev->config_block)
+               goto out_nomem;
+       vcdev->status = kzalloc(sizeof(*vcdev->status), GFP_DMA | GFP_KERNEL);
+       if (!vcdev->status)
+               goto out_nomem;
+ 
+       vcdev->vdev.dev.parent = &cdev->dev;
+       vcdev->vdev.dev.release = virtio_ccw_release_dev;
+       vcdev->vdev.config = &virtio_ccw_config_ops;
+       vcdev->cdev = cdev;
+       init_waitqueue_head(&vcdev->wait_q);
+       INIT_LIST_HEAD(&vcdev->virtqueues);
+       spin_lock_init(&vcdev->lock);
+ 
+       dev_set_drvdata(&cdev->dev, vcdev);
+       vcdev->vdev.id.vendor = cdev->id.cu_type;
+       vcdev->vdev.id.device = cdev->id.cu_model;
+       ret = register_virtio_device(&vcdev->vdev);
+       if (!ret)
+               return 0;
+ 
+       dev_warn(&cdev->dev, "Failed to register virtio device: %d\n",
+                ret);
+       dev_set_drvdata(&cdev->dev, NULL);
+       /*
+        * Drop the device reference instead of freeing directly; cleanup
+        * is presumably done by the release callback set above (defined
+        * outside this view -- confirm).
+        */
+       put_device(&vcdev->vdev.dev);
+       return ret;
+ 
+ out_nomem:
+       /* kfree(NULL) is a no-op, so a partially set up vcdev is fine. */
+       kfree(vcdev->status);
+       kfree(vcdev->config_block);
+       kfree(vcdev);
+       return -ENOMEM;
+ }
+ 
+ static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event)
+ {
+       /*
+        * TODO: Check whether we need special handling here.
+        * For now @event is ignored and 0 is returned unconditionally.
+        */
+       return 0;
+ }
+ 
+ /* Device ids this driver binds to, exported for module autoloading. */
+ static struct ccw_device_id virtio_ids[] = {
+       { CCW_DEVICE(0x3832, 0) },
+       { /* empty trailing entry */ },
+ };
+ MODULE_DEVICE_TABLE(ccw, virtio_ids);
+ 
+ /* ccw driver description: wires the callbacks defined above to the bus. */
+ static struct ccw_driver virtio_ccw_driver = {
+       .driver = {
+               .owner  = THIS_MODULE,
+               .name   = "virtio_ccw",
+       },
+       .ids            = virtio_ids,
+       .probe          = virtio_ccw_probe,
+       .remove         = virtio_ccw_remove,
+       .set_offline    = virtio_ccw_offline,
+       .set_online     = virtio_ccw_online,
+       .notify         = virtio_ccw_cio_notify,
++      .int_class      = IRQIO_VIR,
+ };
+ 
+ /*
+  * Parse a run of hex digits at *cp into *val, advancing *cp past every
+  * digit consumed.  Up to max_digit + 1 digits are read so that over-long
+  * input is detected instead of being silently cut short.
+  *
+  * Returns 0 if between min_digit and max_digit digits were read and the
+  * value does not exceed max_val, 1 otherwise.
+  */
+ static int __init pure_hex(char **cp, unsigned int *val, int min_digit,
+                          int max_digit, int max_val)
+ {
+       int ndigits;
+ 
+       *val = 0;
+       for (ndigits = 0; ndigits <= max_digit; ndigits++) {
+               int nibble = hex_to_bin(**cp);
+ 
+               /* First non-hex character ends the number. */
+               if (nibble < 0)
+                       break;
+               *val = *val * 16 + nibble;
+               (*cp)++;
+       }
+ 
+       if (ndigits < min_digit)
+               return 1;
+       if (ndigits > max_digit)
+               return 1;
+       if (*val > max_val)
+               return 1;
+       return 0;
+ }
+ 
+ /*
+  * Parse a bus id of the form "<cssid>.<ssid>.<devno>" (1-2, exactly 1 and
+  * exactly 4 hex digits respectively) from @str.  The whole string must be
+  * consumed.
+  *
+  * Returns 0 on success and 1 on any syntax or range error; on failure the
+  * output parameters may be only partially written.
+  */
+ static int __init parse_busid(char *str, unsigned int *cssid,
+                             unsigned int *ssid, unsigned int *devno)
+ {
+       char *pos = str;
+ 
+       if (*str == '\0')
+               return 1;
+ 
+       if (pure_hex(&pos, cssid, 1, 2, __MAX_CSSID) || pos[0] != '.')
+               return 1;
+       pos++;
+ 
+       if (pure_hex(&pos, ssid, 1, 1, __MAX_SSID) || pos[0] != '.')
+               return 1;
+       pos++;
+ 
+       if (pure_hex(&pos, devno, 4, 4, __MAX_SUBCHANNEL) || pos[0] != '\0')
+               return 1;
+ 
+       return 0;
+ }
+ 
+ /*
+  * Parse the "no_auto" module parameter: a comma-separated list of bus ids
+  * ("cssid.ssid.devno") or bus id ranges ("from-to").  Every device number
+  * covered by a valid entry is flagged in devs_no_auto.  Malformed entries
+  * and ranges whose end lies before their start are silently skipped.  The
+  * cssid part is parsed and validated but otherwise not used.
+  */
+ static void __init no_auto_parse(void)
+ {
+       unsigned int from_cssid, to_cssid, from_ssid, to_ssid, from, to;
+       char *parm, *str;
+       int rc;
+ 
+       str = no_auto;
+       while ((parm = strsep(&str, ","))) {
+               /* Split off the start id; parm is left at the end id or NULL. */
+               rc = parse_busid(strsep(&parm, "-"), &from_cssid,
+                                &from_ssid, &from);
+               if (rc)
+                       continue;
+               if (parm != NULL) {
+                       rc = parse_busid(parm, &to_cssid,
+                                        &to_ssid, &to);
+                       /*
+                        * to_ssid and to are only guaranteed to be set when
+                        * the end id parsed cleanly, so look at rc before
+                        * comparing the bounds.
+                        */
+                       if (!rc && ((from_ssid > to_ssid) ||
+                           ((from_ssid == to_ssid) && (from > to))))
+                               rc = -EINVAL;
+               } else {
+                       to_cssid = from_cssid;
+                       to_ssid = from_ssid;
+                       to = from;
+               }
+               if (rc)
+                       continue;
+               /* Walk the range, rolling over into the next subchannel set. */
+               while ((from_ssid < to_ssid) ||
+                      ((from_ssid == to_ssid) && (from <= to))) {
+                       set_bit(from, devs_no_auto[from_ssid]);
+                       from++;
+                       if (from > __MAX_SUBCHANNEL) {
+                               from_ssid++;
+                               from = 0;
+                       }
+               }
+       }
+ }
+ 
+ /*
+  * Module entry point.  The no_auto string is parsed before the driver is
+  * registered, since virtio_ccw_probe() consults devs_no_auto.
+  */
+ static int __init virtio_ccw_init(void)
+ {
+       int rc;
+ 
+       no_auto_parse();
+       rc = ccw_driver_register(&virtio_ccw_driver);
+       return rc;
+ }
+ module_init(virtio_ccw_init);
+ 
+ /* Module exit point: unregister the ccw driver registered at init. */
+ static void __exit virtio_ccw_exit(void)
+ {
+       ccw_driver_unregister(&virtio_ccw_driver);
+ }
+ module_exit(virtio_ccw_exit);
diff --combined include/linux/kvm_host.h

index b7996a768eb2c656417fdb082f62871a89fc6f1d,722cae78bbc47d932cd41f2989ee4b7b22d85ba8..cad77fe09d770e47a03f8a802bd09fd30b3d136b
--- 1/include/linux/kvm_host.h
--- 2/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@@ -22,7 -22,6 +22,7 @@@
   #include <linux/rcupdate.h>
   #include <linux/ratelimit.h>
   #include <linux/err.h>
+ +#include <linux/irqflags.h>
   #include <asm/signal.h>
   
   #include <linux/kvm.h>
@@@ -123,6 -122,8 +123,8 @@@ static inline bool is_error_page(struc
   #define KVM_REQ_WATCHDOG          18
   #define KVM_REQ_MASTERCLOCK_UPDATE 19
   #define KVM_REQ_MCLOCK_INPROGRESS 20
+ #define KVM_REQ_EPR_EXIT          21
+ #define KVM_REQ_EOIBITMAP         22
   
   #define KVM_USERSPACE_IRQ_SOURCE_ID           0
   #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID      1
@@@ -267,12 -268,11 +269,11 @@@ static inline int kvm_vcpu_exiting_gues
   struct kvm_memory_slot {
         gfn_t base_gfn;
         unsigned long npages;
-       unsigned long flags;
         unsigned long *dirty_bitmap;
         struct kvm_arch_memory_slot arch;
         unsigned long userspace_addr;
-       int user_alloc;
-       int id;
+       u32 flags;
+       short id;
   };
   
   static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@@ -314,8 -314,12 +315,12 @@@ struct kvm_irq_routing_table {}
   
   #endif
   
+ #ifndef KVM_PRIVATE_MEM_SLOTS
+ #define KVM_PRIVATE_MEM_SLOTS 0
+ #endif
+ 
   #ifndef KVM_MEM_SLOTS_NUM
- #define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+ #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
   #endif
   
   /*
@@@ -327,7 -331,7 +332,7 @@@ struct kvm_memslots 
         u64 generation;
         struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
         /* The mapping table from slot id to the index in memslots[]. */
-       int id_to_index[KVM_MEM_SLOTS_NUM];
+       short id_to_index[KVM_MEM_SLOTS_NUM];
   };
   
   struct kvm {
@@@ -425,7 -429,8 +430,8 @@@ void kvm_exit(void)
   
   void kvm_get_kvm(struct kvm *kvm);
   void kvm_put_kvm(struct kvm *kvm);
- void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new);
+ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
+                    u64 last_generation);
   
   static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
   {
@@@ -448,10 -453,10 +454,10 @@@ id_to_memslot(struct kvm_memslots *slot
   
   int kvm_set_memory_region(struct kvm *kvm,
                           struct kvm_userspace_memory_region *mem,
-                         int user_alloc);
+                         bool user_alloc);
   int __kvm_set_memory_region(struct kvm *kvm,
                             struct kvm_userspace_memory_region *mem,
-                           int user_alloc);
+                           bool user_alloc);
   void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                            struct kvm_memory_slot *dont);
   int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
@@@ -459,11 -464,11 +465,11 @@@ int kvm_arch_prepare_memory_region(stru
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_memory_slot old,
                                 struct kvm_userspace_memory_region *mem,
-                               int user_alloc);
+                               bool user_alloc);
   void kvm_arch_commit_memory_region(struct kvm *kvm,
                                 struct kvm_userspace_memory_region *mem,
                                 struct kvm_memory_slot old,
-                               int user_alloc);
+                               bool user_alloc);
   bool kvm_largepages_enabled(void);
   void kvm_disable_largepages(void);
   /* flush all memory translations */
@@@ -533,6 -538,7 +539,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
   void kvm_flush_remote_tlbs(struct kvm *kvm);
   void kvm_reload_remote_mmus(struct kvm *kvm);
   void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+ void kvm_make_update_eoibitmap_request(struct kvm *kvm);
   
   long kvm_arch_dev_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg);
@@@ -550,7 -556,7 +557,7 @@@ int kvm_vm_ioctl_get_dirty_log(struct k
   int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                    struct
                                    kvm_userspace_memory_region *mem,
-                                  int user_alloc);
+                                  bool user_alloc);
   int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
   long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
@@@ -686,6 -692,7 +693,7 @@@ int kvm_set_irq(struct kvm *kvm, int ir
   int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
   int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
                 int irq_source_id, int level);
+ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
   void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
   void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian);
@@@ -741,52 -748,15 +749,52 @@@ static inline int kvm_deassign_device(s
   }
   #endif /* CONFIG_IOMMU_API */
   
- -static inline void kvm_guest_enter(void)
+ +static inline void __guest_enter(void)
   {
- -      BUG_ON(preemptible());
         /*
          * This is running in ioctl context so we can avoid
          * the call to vtime_account() with its unnecessary idle check.
          */
- -      vtime_account_system_irqsafe(current);
+ +      vtime_account_system(current);
         current->flags |= PF_VCPU;
+ +}
+ +
+ +static inline void __guest_exit(void)
+ +{
+ +      /*
+ +       * This is running in ioctl context so we can avoid
+ +       * the call to vtime_account() with its unnecessary idle check
+ +       * and account system time for the current task directly,
+ +       * before PF_VCPU is cleared again below.
+ +       */
+ +      vtime_account_system(current);
+ +      current->flags &= ~PF_VCPU;
+ +}
+ +
+ +#ifdef CONFIG_CONTEXT_TRACKING
+ +extern void guest_enter(void);
+ +extern void guest_exit(void);
+ +
+ +#else /* !CONFIG_CONTEXT_TRACKING */
+ +static inline void guest_enter(void)
+ +{
+ +      __guest_enter();
+ +}
+ +
+ +static inline void guest_exit(void)
+ +{
+ +      __guest_exit();
+ +}
+ +#endif /* !CONFIG_CONTEXT_TRACKING */
+ +
+ +static inline void kvm_guest_enter(void)
+ +{
+ +      unsigned long flags;
+ +
+ +      BUG_ON(preemptible());
+ +
+ +      local_irq_save(flags);
+ +      guest_enter();
+ +      local_irq_restore(flags);
+ +
         /* KVM does not hold any references to rcu protected data when it
          * switches CPU into a guest mode. In fact switching to a guest mode
          * is very similar to exiting to userspase from rcu point of view. In
@@@ -799,11 -769,12 +807,11 @@@
   
   static inline void kvm_guest_exit(void)
   {
- -      /*
- -       * This is running in ioctl context so we can avoid
- -       * the call to vtime_account() with its unnecessary idle check.
- -       */
- -      vtime_account_system_irqsafe(current);
- -      current->flags &= ~PF_VCPU;
+ +      unsigned long flags;
+ +
+ +      local_irq_save(flags);
+ +      guest_exit();
+ +      local_irq_restore(flags);
   }
   
   /*
diff --combined include/uapi/linux/kvm.h

index c70577cf67bc23b03fdb48c1caf5911e64e787bd,9a2db5767ed5913a69fe2383e166842a7d619e6c..3c56ba3d80c16007f9eda468f96bf337f998c1a9
--- 1/include/uapi/linux/kvm.h
--- 2/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@@ -115,7 -115,6 +115,7 @@@ struct kvm_irq_level 
          * ACPI gsi notion of irq.
          * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
          * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+ +       * For ARM: See Documentation/virtual/kvm/api.txt
          */
         union {
                 __u32 irq;
@@@ -169,6 -168,8 +169,8 @@@ struct kvm_pit_config 
   #define KVM_EXIT_PAPR_HCALL     19
   #define KVM_EXIT_S390_UCONTROL          20
   #define KVM_EXIT_WATCHDOG         21
+ #define KVM_EXIT_S390_TSCH        22
+ #define KVM_EXIT_EPR              23
   
   /* For KVM_EXIT_INTERNAL_ERROR */
   /* Emulate instruction failed. */
@@@ -286,6 -287,19 +288,19 @@@ struct kvm_run 
                         __u64 ret;
                         __u64 args[9];
                 } papr_hcall;
+               /* KVM_EXIT_S390_TSCH */
+               struct {
+                       __u16 subchannel_id;
+                       __u16 subchannel_nr;
+                       __u32 io_int_parm;
+                       __u32 io_int_word;
+                       __u32 ipb;
+                       __u8 dequeued;
+               } s390_tsch;
+               /* KVM_EXIT_EPR */
+               struct {
+                       __u32 epr;
+               } epr;
                 /* Fix the size of the union. */
                 char padding[256];
         };
@@@ -398,10 -412,20 +413,20 @@@ struct kvm_s390_psw 
   #define KVM_S390_PROGRAM_INT          0xfffe0001u
   #define KVM_S390_SIGP_SET_PREFIX      0xfffe0002u
   #define KVM_S390_RESTART              0xfffe0003u
+ #define KVM_S390_MCHK                 0xfffe1000u
   #define KVM_S390_INT_VIRTIO           0xffff2603u
   #define KVM_S390_INT_SERVICE          0xffff2401u
   #define KVM_S390_INT_EMERGENCY                0xffff1201u
   #define KVM_S390_INT_EXTERNAL_CALL    0xffff1202u
+ /* Anything below 0xfffe0000u is taken by INT_IO */
+ #define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
+       (((schid)) |                           \
+        ((ssid) << 16) |                      \
+        ((cssid) << 18) |                     \
+        ((ai) << 26))
+ #define KVM_S390_INT_IO_MIN           0x00000000u
+ #define KVM_S390_INT_IO_MAX           0xfffdffffu
+ 
   
   struct kvm_s390_interrupt {
         __u32 type;
@@@ -636,8 -660,8 +661,10 @@@ struct kvm_ppc_smmu_info 
   #define KVM_CAP_IRQFD_RESAMPLE 82
   #define KVM_CAP_PPC_BOOKE_WATCHDOG 83
   #define KVM_CAP_PPC_HTAB_FD 84
+ #define KVM_CAP_S390_CSS_SUPPORT 85
+ #define KVM_CAP_PPC_EPR 86
+ +#define KVM_CAP_ARM_PSCI 87
+ +#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
   
   #ifdef KVM_CAP_IRQ_ROUTING
   
@@@ -767,11 -791,6 +794,11 @@@ struct kvm_dirty_tlb 
   #define KVM_REG_SIZE_U512     0x0060000000000000ULL
   #define KVM_REG_SIZE_U1024    0x0070000000000000ULL
   
+ +struct kvm_reg_list {
+ +      __u64 n; /* number of regs */
+ +      __u64 reg[0];
+ +};
+ +
   struct kvm_one_reg {
         __u64 id;
         __u64 addr;
@@@ -785,11 -804,6 +812,11 @@@ struct kvm_msi 
         __u8  pad[16];
   };
   
+ +struct kvm_arm_device_addr {
+ +      __u64 id;
+ +      __u64 addr;
+ +};
+ +
   /*
    * ioctls for VM fds
    */
@@@ -875,8 -889,6 +902,8 @@@ struct kvm_s390_ucas_mapping 
   #define KVM_ALLOCATE_RMA        _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
   /* Available with KVM_CAP_PPC_HTAB_FD */
   #define KVM_PPC_GET_HTAB_FD     _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
+ +/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
+ +#define KVM_ARM_SET_DEVICE_ADDR         _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
   
   /*
    * ioctls for vcpu fds
@@@ -947,8 -959,6 +974,8 @@@
   #define KVM_SET_ONE_REG                 _IOW(KVMIO,  0xac, struct kvm_one_reg)
   /* VM is being stopped by host */
   #define KVM_KVMCLOCK_CTRL       _IO(KVMIO,   0xad)
+ +#define KVM_ARM_VCPU_INIT       _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
+ +#define KVM_GET_REG_LIST        _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
   
   #define KVM_DEV_ASSIGN_ENABLE_IOMMU   (1 << 0)
   #define KVM_DEV_ASSIGN_PCI_2_3                (1 << 1)
diff --combined kernel/sched/core.c

index 053dfd7692d1ffafefa38698d9cc0e1277955013,01edad9b5d71f3603a0b1602e01272189e20301a..f1bdecf09afb593560f01309b791b5dcb1ed45d6
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -83,7 -83,7 +83,7 @@@
   #endif
   
   #include "sched.h"
- -#include "../workqueue_sched.h"
+ +#include "../workqueue_internal.h"
   #include "../smpboot.h"
   
   #define CREATE_TRACE_POINTS
@@@ -193,10 -193,23 +193,10 @@@ static void sched_feat_disable(int i) 
   static void sched_feat_enable(int i) { };
   #endif /* HAVE_JUMP_LABEL */
   
- -static ssize_t
- -sched_feat_write(struct file *filp, const char __user *ubuf,
- -              size_t cnt, loff_t *ppos)
+ +static int sched_feat_set(char *cmp)
   {
- -      char buf[64];
- -      char *cmp;
- -      int neg = 0;
         int i;
- -
- -      if (cnt > 63)
- -              cnt = 63;
- -
- -      if (copy_from_user(&buf, ubuf, cnt))
- -              return -EFAULT;
- -
- -      buf[cnt] = 0;
- -      cmp = strstrip(buf);
+ +      int neg = 0;
   
         if (strncmp(cmp, "NO_", 3) == 0) {
                 neg = 1;
@@@ -216,27 -229,6 +216,27 @@@
                 }
         }
   
+ +      return i;
+ +}
+ +
+ +static ssize_t
+ +sched_feat_write(struct file *filp, const char __user *ubuf,
+ +              size_t cnt, loff_t *ppos)
+ +{
+ +      char buf[64];
+ +      char *cmp;
+ +      int i;
+ +
+ +      if (cnt > 63)
+ +              cnt = 63;
+ +
+ +      if (copy_from_user(&buf, ubuf, cnt))
+ +              return -EFAULT;
+ +
+ +      buf[cnt] = 0;
+ +      cmp = strstrip(buf);
+ +
+ +      i = sched_feat_set(cmp);
         if (i == __SCHED_FEAT_NR)
                 return -EINVAL;
   
@@@ -1132,28 -1124,18 +1132,28 @@@ EXPORT_SYMBOL_GPL(kick_process)
    */
   static int select_fallback_rq(int cpu, struct task_struct *p)
   {
- -      const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ +      int nid = cpu_to_node(cpu);
+ +      const struct cpumask *nodemask = NULL;
         enum { cpuset, possible, fail } state = cpuset;
         int dest_cpu;
   
- -      /* Look for allowed, online CPU in same node. */
- -      for_each_cpu(dest_cpu, nodemask) {
- -              if (!cpu_online(dest_cpu))
- -                      continue;
- -              if (!cpu_active(dest_cpu))
- -                      continue;
- -              if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- -                      return dest_cpu;
+ +      /*
+ +       * If the node that the cpu is on has been offlined, cpu_to_node()
+ +       * will return -1. There is no cpu on the node, and we should
+ +       * select the cpu on the other node.
+ +       */
+ +      if (nid != -1) {
+ +              nodemask = cpumask_of_node(nid);
+ +
+ +              /* Look for allowed, online CPU in same node. */
+ +              for_each_cpu(dest_cpu, nodemask) {
+ +                      if (!cpu_online(dest_cpu))
+ +                              continue;
+ +                      if (!cpu_active(dest_cpu))
+ +                              continue;
+ +                      if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ +                              return dest_cpu;
+ +              }
         }
   
         for (;;) {
@@@ -1533,8 -1515,7 +1533,8 @@@ out
    */
   int wake_up_process(struct task_struct *p)
   {
- -      return try_to_wake_up(p, TASK_ALL, 0);
+ +      WARN_ON(task_is_stopped_or_traced(p));
+ +      return try_to_wake_up(p, TASK_NORMAL, 0);
   }
   EXPORT_SYMBOL(wake_up_process);
   
@@@ -1579,40 -1560,7 +1579,40 @@@ static void __sched_fork(struct task_st
   #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
   #endif
+ +
+ +#ifdef CONFIG_NUMA_BALANCING
+ +      if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ +              p->mm->numa_next_scan = jiffies;
+ +              p->mm->numa_next_reset = jiffies;
+ +              p->mm->numa_scan_seq = 0;
+ +      }
+ +
+ +      p->node_stamp = 0ULL;
+ +      p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ +      p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ +      p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ +      p->numa_work.next = &p->numa_work;
+ +#endif /* CONFIG_NUMA_BALANCING */
+ +}
+ +
+ +#ifdef CONFIG_NUMA_BALANCING
+ +#ifdef CONFIG_SCHED_DEBUG
+ +void set_numabalancing_state(bool enabled)
+ +{
+ +      if (enabled)
+ +              sched_feat_set("NUMA");
+ +      else
+ +              sched_feat_set("NO_NUMA");
   }
+ +#else
+ +__read_mostly bool numabalancing_enabled;
+ +
+ +void set_numabalancing_state(bool enabled)
+ +{
+ +      numabalancing_enabled = enabled;
+ +}
+ +#endif /* CONFIG_SCHED_DEBUG */
+ +#endif /* CONFIG_NUMA_BALANCING */
   
   /*
    * fork()/clone()-time setup:
@@@ -4108,14 -4056,8 +4108,14 @@@ long sched_setaffinity(pid_t pid, cons
                 goto out_free_cpus_allowed;
         }
         retval = -EPERM;
- -      if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- -              goto out_unlock;
+ +      if (!check_same_owner(p)) {
+ +              rcu_read_lock();
+ +              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ +                      rcu_read_unlock();
+ +                      goto out_unlock;
+ +              }
+ +              rcu_read_unlock();
+ +      }
   
         retval = security_task_setscheduler(p);
         if (retval)
@@@ -4374,20 -4316,32 +4374,32 @@@ EXPORT_SYMBOL(yield)
    * It's the caller's job to ensure that the target task struct
    * can't go away on us before we can do any checks.
    *
-  * Returns true if we indeed boosted the target task.
+  * Returns:
+  *    true (>0) if we indeed boosted the target task.
+  *    false (0) if we failed to boost the target.
+  *    -ESRCH if there's no task to yield to.
    */
   bool __sched yield_to(struct task_struct *p, bool preempt)
   {
         struct task_struct *curr = current;
         struct rq *rq, *p_rq;
         unsigned long flags;
- -      bool yielded = 0;
+ +      int yielded = 0;
   
         local_irq_save(flags);
         rq = this_rq();
   
   again:
         p_rq = task_rq(p);
+       /*
+        * If we're the only runnable task on the rq and target rq also
+        * has only one task, there's absolutely no point in yielding.
+        */
+       if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+               yielded = -ESRCH;
+               goto out_irq;
+       }
+ 
         double_rq_lock(rq, p_rq);
         while (task_rq(p) != p_rq) {
                 double_rq_unlock(rq, p_rq);
@@@ -4395,13 -4349,13 +4407,13 @@@
         }
   
         if (!curr->sched_class->yield_to_task)
-               goto out;
+               goto out_unlock;
   
         if (curr->sched_class != p->sched_class)
-               goto out;
+               goto out_unlock;
   
         if (task_running(p_rq, p) || p->state)
-               goto out;
+               goto out_unlock;
   
         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
         if (yielded) {
@@@ -4414,11 -4368,12 +4426,12 @@@
                         resched_task(p_rq->curr);
         }
   
- out:
+ out_unlock:
         double_rq_unlock(rq, p_rq);
+ out_irq:
         local_irq_restore(flags);
   
-       if (yielded)
+       if (yielded > 0)
                 schedule();
   
         return yielded;
@@@ -4677,7 -4632,6 +4690,7 @@@ void __cpuinit init_idle(struct task_st
          */
         idle->sched_class = &idle_sched_class;
         ftrace_graph_init_idle_task(idle, cpu);
+ +      vtime_init_idle(idle);
   #if defined(CONFIG_SMP)
         sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
   #endif
@@@ -7171,6 -7125,7 +7184,6 @@@ static void free_sched_group(struct tas
   struct task_group *sched_create_group(struct task_group *parent)
   {
         struct task_group *tg;
- -      unsigned long flags;
   
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@@ -7182,17 -7137,6 +7195,17 @@@
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
   
+ +      return tg;
+ +
+ +err:
+ +      free_sched_group(tg);
+ +      return ERR_PTR(-ENOMEM);
+ +}
+ +
+ +void sched_online_group(struct task_group *tg, struct task_group *parent)
+ +{
+ +      unsigned long flags;
+ +
         spin_lock_irqsave(&task_group_lock, flags);
         list_add_rcu(&tg->list, &task_groups);
   
@@@ -7202,6 -7146,12 +7215,6 @@@
         INIT_LIST_HEAD(&tg->children);
         list_add_rcu(&tg->siblings, &parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
- -
- -      return tg;
- -
- -err:
- -      free_sched_group(tg);
- -      return ERR_PTR(-ENOMEM);
   }
   
   /* rcu callback to free various structures associated with a task group */
@@@ -7213,12 -7163,6 +7226,12 @@@ static void free_sched_group_rcu(struc
   
   /* Destroy runqueue etc associated with a task group */
   void sched_destroy_group(struct task_group *tg)
+ +{
+ +      /* wait for possible concurrent references to cfs_rqs complete */
+ +      call_rcu(&tg->rcu, free_sched_group_rcu);
+ +}
+ +
+ +void sched_offline_group(struct task_group *tg)
   {
         unsigned long flags;
         int i;
@@@ -7231,6 -7175,9 +7244,6 @@@
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
- -
- -      /* wait for possible concurrent references to cfs_rqs complete */
- -      call_rcu(&tg->rcu, free_sched_group_rcu);
   }
   
   /* change task's runqueue when it moves between groups.
@@@ -7526,25 -7473,6 +7539,25 @@@ static int sched_rt_global_constraints(
   }
   #endif /* CONFIG_RT_GROUP_SCHED */
   
+ +/*
+ + * sysctl handler for the round-robin timeslice.  Reads and writes are
+ + * passed to proc_dointvec() under a handler-local mutex.  After a
+ + * successful write, sched_rr_timeslice (presumably the variable @table
+ + * points at -- confirm against the sysctl table) is converted so that it
+ + * is kept in jiffies internally.
+ + *
+ + * Returns the result of proc_dointvec().
+ + */
+ +int sched_rr_handler(struct ctl_table *table, int write,
+ +              void __user *buffer, size_t *lenp,
+ +              loff_t *ppos)
+ +{
+ +      static DEFINE_MUTEX(mutex);
+ +      int ret;
+ +
+ +      mutex_lock(&mutex);
+ +      ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ +      if (write && !ret) {
+ +              /* Zero or a negative value resets to the default. */
+ +              if (sched_rr_timeslice <= 0)
+ +                      sched_rr_timeslice = RR_TIMESLICE;
+ +              else
+ +                      sched_rr_timeslice =
+ +                              msecs_to_jiffies(sched_rr_timeslice);
+ +      }
+ +      mutex_unlock(&mutex);
+ +      return ret;
+ +}
+ +
   int sched_rt_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
@@@ -7601,19 -7529,6 +7614,19 @@@ static struct cgroup_subsys_state *cpu_
         return &tg->css;
   }
   
+ +/*
+ + * css_online callback: bring the task group of @cgrp online under the
+ + * task group of its parent cgroup.  A cgroup without a parent is left
+ + * alone.  Always returns 0.
+ + */
+ +static int cpu_cgroup_css_online(struct cgroup *cgrp)
+ +{
+ +      struct task_group *tg = cgroup_tg(cgrp);
+ +
+ +      if (cgrp->parent)
+ +              sched_online_group(tg, cgroup_tg(cgrp->parent));
+ +      return 0;
+ +}
+ +
   static void cpu_cgroup_css_free(struct cgroup *cgrp)
   {
         struct task_group *tg = cgroup_tg(cgrp);
@@@ -7621,13 -7536,6 +7634,13 @@@
         sched_destroy_group(tg);
   }
   
+ +/* css_offline callback: take the task group of @cgrp offline. */
+ +static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+ +{
+ +      sched_offline_group(cgroup_tg(cgrp));
+ +}
+ +
   static int cpu_cgroup_can_attach(struct cgroup *cgrp,
                                  struct cgroup_taskset *tset)
   {
@@@ -7983,8 -7891,6 +7996,8 @@@ struct cgroup_subsys cpu_cgroup_subsys 
         .name           = "cpu",
         .css_alloc      = cpu_cgroup_css_alloc,
         .css_free       = cpu_cgroup_css_free,
+ +      .css_online     = cpu_cgroup_css_online,
+ +      .css_offline    = cpu_cgroup_css_offline,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 24 Feb 2013 21:07:18 +0000 (13:07 -0800)
		1	2
Documentation/virtual/kvm/api.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/include/asm/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/include/asm/reg.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/asm-offsets.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kvm/book3s_pr.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kvm/emulate.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/include/asm/irq.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/interrupt.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/s390/kvm/kvm-s390.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/kvm_para.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/vmx.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/uapi/asm/vmx.h	patch \|	diff1 \|	\|	blob \| history
arch/x86/kernel/kvmclock.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/s390/kvm/kvm_virtio.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/s390/kvm/virtio_ccw.c	patch \|	\|	diff2 \|	blob \| history
include/linux/kvm_host.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/uapi/linux/kvm.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history