From c43203cab1e2e193c43f8295f01dfb2a0721d9e5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 1 Jun 2016 22:26:00 +0200
Subject: [PATCH 001/302] KVM: x86: avoid simultaneous queueing of both IRQ and
 SMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the processor exits to KVM while delivering an interrupt,
the hypervisor then requeues the interrupt for the next vmentry.
Trying to enter SMM in this same window causes KVM to enter non-root
mode in emulated SMM (i.e. with IF=0) and with a request to
inject an IRQ (i.e. with a valid VM-entry interrupt info field).
This is invalid guest state (SDM 26.3.1.4 "Check on Guest RIP
and RFLAGS") and the processor fails vmentry.

The fix is to defer the injection from KVM_REQ_SMI to KVM_REQ_EVENT,
like we already do for e.g. NMIs.  This patch doesn't change the
name of the process_smi function so that it can be applied to
stable releases.  The next patch will modify the names so that
process_nmi and process_smi handle respectively KVM_REQ_NMI and
KVM_REQ_SMI.

This is especially common with Windows, probably due to the
self-IPI trick that it uses to deliver deferred procedure
calls (DPCs).

Reported-by: Laszlo Ersek <lersek@redhat.com>
Reported-by: Michał Zegan <webczat_200@poczta.onet.pl>
Fixes: 64d6067057d9658acb8675afcfba549abdb7fc16
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 902d9da123929b..5a26f8c066faeb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,6 +91,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -5302,13 +5303,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 		/* This is a good place to trace that we are exiting SMM.  */
 		trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 
-		if (unlikely(vcpu->arch.smi_pending)) {
-			kvm_make_request(KVM_REQ_SMI, vcpu);
-			vcpu->arch.smi_pending = 0;
-		} else {
-			/* Process a latched INIT, if any.  */
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
-		}
+		/* Process a latched INIT or SMI, if any.  */
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 	}
 
 	kvm_mmu_reset_context(vcpu);
@@ -6108,7 +6104,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 	}
 
 	/* try to inject new event if pending */
-	if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+		vcpu->arch.smi_pending = false;
+		process_smi(vcpu);
+	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
 		--vcpu->arch.nmi_pending;
 		vcpu->arch.nmi_injected = true;
 		kvm_x86_ops->set_nmi(vcpu);
@@ -6318,11 +6317,6 @@ static void process_smi(struct kvm_vcpu *vcpu)
 	char buf[512];
 	u32 cr0;
 
-	if (is_smm(vcpu)) {
-		vcpu->arch.smi_pending = true;
-		return;
-	}
-
 	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
 	vcpu->arch.hflags |= HF_SMM_MASK;
 	memset(buf, 0, 512);
@@ -6385,6 +6379,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
 	kvm_mmu_reset_context(vcpu);
 }
 
+static void process_smi_request(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.smi_pending = true;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
 	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6506,7 +6506,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_SMI, vcpu))
-			process_smi(vcpu);
+			process_smi_request(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -6579,8 +6579,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		if (inject_pending_event(vcpu, req_int_win) != 0)
 			req_immediate_exit = true;
-		/* enable NMI/IRQ window open exits if needed */
 		else {
+			/* Enable NMI/IRQ window open exits if needed.
+			 *
+			 * SMIs have two cases: 1) they can be nested, and
+			 * then there is nothing to do here because RSM will
+			 * cause a vmexit anyway; 2) or the SMI can be pending
+			 * because inject_pending_event has completed the
+			 * injection of an IRQ or NMI from the previous vmexit,
+			 * and then we request an immediate exit to inject the SMI.
+			 */
+			if (vcpu->arch.smi_pending && !is_smm(vcpu))
+				req_immediate_exit = true;
 			if (vcpu->arch.nmi_pending)
 				kvm_x86_ops->enable_nmi_window(vcpu);
 			if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6631,8 +6641,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	kvm_load_guest_xcr0(vcpu);
 
-	if (req_immediate_exit)
+	if (req_immediate_exit) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		smp_send_reschedule(vcpu->cpu);
+	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
 	wait_lapic_expire(vcpu);
@@ -7433,6 +7445,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	vcpu->arch.hflags = 0;
 
+	vcpu->arch.smi_pending = 0;
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;

From ee2cd4b7555e3a629f399c3ef228ceb42067e7af Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 1 Jun 2016 22:26:01 +0200
Subject: [PATCH 002/302] KVM: x86: rename process_smi to enter_smm,
 process_smi_request to process_smi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the function names more similar between KVM_REQ_NMI and KVM_REQ_SMI.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/x86.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a26f8c066faeb..1785415ebff3bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
-static void process_smi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -6106,7 +6106,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 	/* try to inject new event if pending */
 	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
 		vcpu->arch.smi_pending = false;
-		process_smi(vcpu);
+		enter_smm(vcpu);
 	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
 		--vcpu->arch.nmi_pending;
 		vcpu->arch.nmi_injected = true;
@@ -6130,6 +6130,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 			kvm_x86_ops->set_irq(vcpu);
 		}
 	}
+
 	return 0;
 }
 
@@ -6153,7 +6154,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 #define put_smstate(type, buf, offset, val)			  \
 	*(type *)((buf) + (offset) - 0x7e00) = val
 
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
 {
 	u32 flags = 0;
 	flags |= seg->g       << 23;
@@ -6167,7 +6168,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
 	return flags;
 }
 
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 {
 	struct kvm_segment seg;
 	int offset;
@@ -6182,11 +6183,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 
 	put_smstate(u32, buf, offset + 8, seg.base);
 	put_smstate(u32, buf, offset + 4, seg.limit);
-	put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
 }
 
 #ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
 	struct kvm_segment seg;
 	int offset;
@@ -6195,7 +6196,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 	kvm_get_segment(vcpu, &seg, n);
 	offset = 0x7e00 + n * 16;
 
-	flags = process_smi_get_segment_flags(&seg) >> 8;
+	flags = enter_smm_get_segment_flags(&seg) >> 8;
 	put_smstate(u16, buf, offset, seg.selector);
 	put_smstate(u16, buf, offset + 2, flags);
 	put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6203,7 +6204,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 }
 #endif
 
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
 	struct desc_ptr dt;
 	struct kvm_segment seg;
@@ -6227,13 +6228,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7fc4, seg.selector);
 	put_smstate(u32, buf, 0x7f64, seg.base);
 	put_smstate(u32, buf, 0x7f60, seg.limit);
-	put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
 
 	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
 	put_smstate(u32, buf, 0x7fc0, seg.selector);
 	put_smstate(u32, buf, 0x7f80, seg.base);
 	put_smstate(u32, buf, 0x7f7c, seg.limit);
-	put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 
 	kvm_x86_ops->get_gdt(vcpu, &dt);
 	put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6244,7 +6245,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7f54, dt.size);
 
 	for (i = 0; i < 6; i++)
-		process_smi_save_seg_32(vcpu, buf, i);
+		enter_smm_save_seg_32(vcpu, buf, i);
 
 	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
 
@@ -6253,7 +6254,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
 }
 
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 {
 #ifdef CONFIG_X86_64
 	struct desc_ptr dt;
@@ -6285,7 +6286,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
 	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
 	put_smstate(u16, buf, 0x7e90, seg.selector);
-	put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
 	put_smstate(u32, buf, 0x7e94, seg.limit);
 	put_smstate(u64, buf, 0x7e98, seg.base);
 
@@ -6295,7 +6296,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 
 	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
 	put_smstate(u16, buf, 0x7e70, seg.selector);
-	put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
 	put_smstate(u32, buf, 0x7e74, seg.limit);
 	put_smstate(u64, buf, 0x7e78, seg.base);
 
@@ -6304,13 +6305,13 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u64, buf, 0x7e68, dt.address);
 
 	for (i = 0; i < 6; i++)
-		process_smi_save_seg_64(vcpu, buf, i);
+		enter_smm_save_seg_64(vcpu, buf, i);
 #else
 	WARN_ON_ONCE(1);
 #endif
 }
 
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
 {
 	struct kvm_segment cs, ds;
 	struct desc_ptr dt;
@@ -6321,9 +6322,9 @@ static void process_smi(struct kvm_vcpu *vcpu)
 	vcpu->arch.hflags |= HF_SMM_MASK;
 	memset(buf, 0, 512);
 	if (guest_cpuid_has_longmode(vcpu))
-		process_smi_save_state_64(vcpu, buf);
+		enter_smm_save_state_64(vcpu, buf);
 	else
-		process_smi_save_state_32(vcpu, buf);
+		enter_smm_save_state_32(vcpu, buf);
 
 	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
@@ -6379,7 +6380,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
 	kvm_mmu_reset_context(vcpu);
 }
 
-static void process_smi_request(struct kvm_vcpu *vcpu)
+static void process_smi(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.smi_pending = true;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -6506,7 +6507,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_SMI, vcpu))
-			process_smi_request(vcpu);
+			process_smi(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))

From 250715a6171a076748be8ab88b274e72f0cfb435 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 1 Jun 2016 14:09:24 +0200
Subject: [PATCH 003/302] KVM: x86: protect KVM_CREATE_PIT/KVM_CREATE_PIT2 with
 kvm->lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The syzkaller folks reported a NULL pointer dereference that seems
to be caused by a race between KVM_CREATE_IRQCHIP and KVM_CREATE_PIT2.
The former takes kvm->lock (except when registering the devices,
which needs kvm->slots_lock); the latter takes kvm->slots_lock only.
Change KVM_CREATE_PIT2 to follow the same model as KVM_CREATE_IRQCHIP.

Testcase:

    #include <pthread.h>
    #include <linux/kvm.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdlib.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    long r[23];

    void* thr1(void* arg)
    {
        struct kvm_pit_config pitcfg = { .flags = 4 };
        switch ((long)arg) {
        case 0: r[2]  = open("/dev/kvm", O_RDONLY|O_ASYNC);    break;
        case 1: r[3]  = ioctl(r[2], KVM_CREATE_VM, 0);         break;
        case 2: r[4]  = ioctl(r[3], KVM_CREATE_IRQCHIP, 0);    break;
        case 3: r[22] = ioctl(r[3], KVM_CREATE_PIT2, &pitcfg); break;
        }
        return 0;
    }

    int main(int argc, char **argv)
    {
        long i;
        pthread_t th[4];

        memset(r, -1, sizeof(r));
        for (i = 0; i < 4; i++) {
            pthread_create(&th[i], 0, thr1, (void*)i);
            if (argc > 1 && rand()%2) usleep(rand()%1000);
        }
        usleep(20000);
        return 0;
    }

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/i8254.c | 4 +++-
 arch/x86/kvm/x86.c   | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a4bf5b45d65a77..5fb6c620180e19 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
 	.write    = speaker_ioport_write,
 };
 
-/* Caller must hold slots_lock */
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
@@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	kvm_pit_set_reinject(pit, true);
 
+	mutex_lock(&kvm->slots_lock);
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
 	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
 				      KVM_PIT_MEM_LENGTH, &pit->dev);
@@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 		if (ret < 0)
 			goto fail_register_speaker;
 	}
+	mutex_unlock(&kvm->slots_lock);
 
 	return pit;
 
 fail_register_speaker:
 	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
 fail_register_pit:
+	mutex_unlock(&kvm->slots_lock);
 	kvm_pit_set_reinject(pit, false);
 	kthread_stop(pit->worker_task);
 fail_kthread:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1785415ebff3bf..9d6a305936553d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3879,7 +3879,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 				   sizeof(struct kvm_pit_config)))
 			goto out;
 	create_pit:
-		mutex_lock(&kvm->slots_lock);
+		mutex_lock(&kvm->lock);
 		r = -EEXIST;
 		if (kvm->arch.vpit)
 			goto create_pit_unlock;
@@ -3888,7 +3888,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (kvm->arch.vpit)
 			r = 0;
 	create_pit_unlock:
-		mutex_unlock(&kvm->slots_lock);
+		mutex_unlock(&kvm->lock);
 		break;
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */

From dca4d728773a2f48e999c8617524bbf8dee4807f Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Tue, 31 May 2016 13:21:14 +0800
Subject: [PATCH 004/302] kvm/x86: remove unnecessary header file inclusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/x86/kvm/iommu.c includes <linux/intel-iommu.h> and <linux/dmar.h>, both
of which are unnecessary and in fact incorrect here, as they are Intel-specific.

Building kvm on x86 passed after removing the above inclusions.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/iommu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 3069281904d385..4f2010c5feba71 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -28,9 +28,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/stat.h>
-#include <linux/dmar.h>
 #include <linux/iommu.h>
-#include <linux/intel-iommu.h>
 #include "assigned-dev.h"
 
 static bool allow_unsafe_assigned_interrupts;

From e65f30e0cb29694c4241bd9c96ea9413938fcec5 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Thu, 4 Feb 2016 10:24:52 +0100
Subject: [PATCH 005/302] s390: hypfs: Move diag implementation and data
 definitions

Diag 204 data and function definitions currently live in the hypfs
files. As KVM will be a consumer of this data, we need to make it
publicly available and move it to the appropriate diag.{c,h} files.

__attribute__ ((packed)) occurrences were replaced with __packed for
all moved structs.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/hypfs/hypfs_diag.c | 361 +++++++++++------------------------
 arch/s390/include/asm/diag.h | 127 ++++++++++++
 arch/s390/kernel/diag.c      |  22 +++
 3 files changed, 256 insertions(+), 254 deletions(-)

diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 045035796ca7d5..1e28414d7275e7 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -19,29 +19,10 @@
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
-#define LPAR_NAME_LEN 8		/* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16		/* type name len of cpus in diag224 name table */
 #define TMP_SIZE 64		/* size of temporary buffers */
 
 #define DBFS_D204_HDR_VERSION	0
 
-/* diag 204 subcodes */
-enum diag204_sc {
-	SUBC_STIB4 = 4,
-	SUBC_RSI = 5,
-	SUBC_STIB6 = 6,
-	SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
-	INFO_SIMPLE = 0,
-	INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG  0x80
-
 static char *diag224_cpu_names;			/* diag 224 name table */
 static enum diag204_sc diag204_store_sc;	/* used subcode for store */
 static enum diag204_format diag204_info_type;	/* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages;		/* number of pages for diag204 data */
 static struct dentry *dbfs_d204_file;
 
 /*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
  *
  * Since we have two different diag 204 data formats for old and new s390
  * machines, we do not access the structs directly, but use getter functions for
@@ -62,302 +43,173 @@ static struct dentry *dbfs_d204_file;
 
 /* Time information block */
 
-struct info_blk_hdr {
-	__u8  npar;
-	__u8  flags;
-	__u16 tslice;
-	__u16 phys_cpus;
-	__u16 this_part;
-	__u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
-	__u8  npar;
-	__u8  flags;
-	__u16 tslice;
-	__u16 phys_cpus;
-	__u16 this_part;
-	__u64 curtod1;
-	__u64 curtod2;
-	char reserved[40];
-} __attribute__ ((packed));
-
 static inline int info_blk_hdr__size(enum diag204_format type)
 {
-	if (type == INFO_SIMPLE)
-		return sizeof(struct info_blk_hdr);
-	else /* INFO_EXT */
-		return sizeof(struct x_info_blk_hdr);
+	if (type == DIAG204_INFO_SIMPLE)
+		return sizeof(struct diag204_info_blk_hdr);
+	else /* DIAG204_INFO_EXT */
+		return sizeof(struct diag204_x_info_blk_hdr);
 }
 
 static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct info_blk_hdr *)hdr)->npar;
-	else /* INFO_EXT */
-		return ((struct x_info_blk_hdr *)hdr)->npar;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_info_blk_hdr *)hdr)->npar;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
 }
 
 static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct info_blk_hdr *)hdr)->flags;
-	else /* INFO_EXT */
-		return ((struct x_info_blk_hdr *)hdr)->flags;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_info_blk_hdr *)hdr)->flags;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
 }
 
 static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct info_blk_hdr *)hdr)->phys_cpus;
-	else /* INFO_EXT */
-		return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
 }
 
 /* Partition header */
 
-struct part_hdr {
-	__u8 pn;
-	__u8 cpus;
-	char reserved[6];
-	char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
-	__u8  pn;
-	__u8  cpus;
-	__u8  rcpus;
-	__u8  pflag;
-	__u32 mlu;
-	char  part_name[LPAR_NAME_LEN];
-	char  lpc_name[8];
-	char  os_name[8];
-	__u64 online_cs;
-	__u64 online_es;
-	__u8  upid;
-	char  reserved1[3];
-	__u32 group_mlu;
-	char  group_name[8];
-	char  reserved2[32];
-} __attribute__ ((packed));
-
 static inline int part_hdr__size(enum diag204_format type)
 {
-	if (type == INFO_SIMPLE)
-		return sizeof(struct part_hdr);
-	else /* INFO_EXT */
-		return sizeof(struct x_part_hdr);
+	if (type == DIAG204_INFO_SIMPLE)
+		return sizeof(struct diag204_part_hdr);
+	else /* DIAG204_INFO_EXT */
+		return sizeof(struct diag204_x_part_hdr);
 }
 
 static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct part_hdr *)hdr)->cpus;
-	else /* INFO_EXT */
-		return ((struct x_part_hdr *)hdr)->rcpus;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_part_hdr *)hdr)->cpus;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_part_hdr *)hdr)->rcpus;
 }
 
 static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
 				       char *name)
 {
-	if (type == INFO_SIMPLE)
-		memcpy(name, ((struct part_hdr *)hdr)->part_name,
-		       LPAR_NAME_LEN);
-	else /* INFO_EXT */
-		memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
-		       LPAR_NAME_LEN);
-	EBCASC(name, LPAR_NAME_LEN);
-	name[LPAR_NAME_LEN] = 0;
+	if (type == DIAG204_INFO_SIMPLE)
+		memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+		       DIAG204_LPAR_NAME_LEN);
+	else /* DIAG204_INFO_EXT */
+		memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+		       DIAG204_LPAR_NAME_LEN);
+	EBCASC(name, DIAG204_LPAR_NAME_LEN);
+	name[DIAG204_LPAR_NAME_LEN] = 0;
 	strim(name);
 }
 
-struct cpu_info {
-	__u16 cpu_addr;
-	char  reserved1[2];
-	__u8  ctidx;
-	__u8  cflag;
-	__u16 weight;
-	__u64 acc_time;
-	__u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
-	__u16 cpu_addr;
-	char  reserved1[2];
-	__u8  ctidx;
-	__u8  cflag;
-	__u16 weight;
-	__u64 acc_time;
-	__u64 lp_time;
-	__u16 min_weight;
-	__u16 cur_weight;
-	__u16 max_weight;
-	char  reseved2[2];
-	__u64 online_time;
-	__u64 wait_time;
-	__u32 pma_weight;
-	__u32 polar_weight;
-	char  reserved3[40];
-} __attribute__ ((packed));
-
 /* CPU info block */
 
 static inline int cpu_info__size(enum diag204_format type)
 {
-	if (type == INFO_SIMPLE)
-		return sizeof(struct cpu_info);
-	else /* INFO_EXT */
-		return sizeof(struct x_cpu_info);
+	if (type == DIAG204_INFO_SIMPLE)
+		return sizeof(struct diag204_cpu_info);
+	else /* DIAG204_INFO_EXT */
+		return sizeof(struct diag204_x_cpu_info);
 }
 
 static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct cpu_info *)hdr)->ctidx;
-	else /* INFO_EXT */
-		return ((struct x_cpu_info *)hdr)->ctidx;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_cpu_info *)hdr)->ctidx;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_cpu_info *)hdr)->ctidx;
 }
 
 static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct cpu_info *)hdr)->cpu_addr;
-	else /* INFO_EXT */
-		return ((struct x_cpu_info *)hdr)->cpu_addr;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
 }
 
 static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct cpu_info *)hdr)->acc_time;
-	else /* INFO_EXT */
-		return ((struct x_cpu_info *)hdr)->acc_time;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_cpu_info *)hdr)->acc_time;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_cpu_info *)hdr)->acc_time;
 }
 
 static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct cpu_info *)hdr)->lp_time;
-	else /* INFO_EXT */
-		return ((struct x_cpu_info *)hdr)->lp_time;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_cpu_info *)hdr)->lp_time;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_cpu_info *)hdr)->lp_time;
 }
 
 static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
+	if (type == DIAG204_INFO_SIMPLE)
 		return 0;	/* online_time not available in simple info */
-	else /* INFO_EXT */
-		return ((struct x_cpu_info *)hdr)->online_time;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_cpu_info *)hdr)->online_time;
 }
 
 /* Physical header */
 
-struct phys_hdr {
-	char reserved1[1];
-	__u8 cpus;
-	char reserved2[6];
-	char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
-	char reserved1[1];
-	__u8 cpus;
-	char reserved2[6];
-	char mgm_name[8];
-	char reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_hdr__size(enum diag204_format type)
 {
-	if (type == INFO_SIMPLE)
-		return sizeof(struct phys_hdr);
-	else /* INFO_EXT */
-		return sizeof(struct x_phys_hdr);
+	if (type == DIAG204_INFO_SIMPLE)
+		return sizeof(struct diag204_phys_hdr);
+	else /* DIAG204_INFO_EXT */
+		return sizeof(struct diag204_x_phys_hdr);
 }
 
 static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct phys_hdr *)hdr)->cpus;
-	else /* INFO_EXT */
-		return ((struct x_phys_hdr *)hdr)->cpus;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_phys_hdr *)hdr)->cpus;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_phys_hdr *)hdr)->cpus;
 }
 
 /* Physical CPU info block */
 
-struct phys_cpu {
-	__u16 cpu_addr;
-	char  reserved1[2];
-	__u8  ctidx;
-	char  reserved2[3];
-	__u64 mgm_time;
-	char  reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
-	__u16 cpu_addr;
-	char  reserved1[2];
-	__u8  ctidx;
-	char  reserved2[3];
-	__u64 mgm_time;
-	char  reserved3[80];
-} __attribute__ ((packed));
-
 static inline int phys_cpu__size(enum diag204_format type)
 {
-	if (type == INFO_SIMPLE)
-		return sizeof(struct phys_cpu);
-	else /* INFO_EXT */
-		return sizeof(struct x_phys_cpu);
+	if (type == DIAG204_INFO_SIMPLE)
+		return sizeof(struct diag204_phys_cpu);
+	else /* DIAG204_INFO_EXT */
+		return sizeof(struct diag204_x_phys_cpu);
 }
 
 static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct phys_cpu *)hdr)->cpu_addr;
-	else /* INFO_EXT */
-		return ((struct x_phys_cpu *)hdr)->cpu_addr;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
 }
 
 static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct phys_cpu *)hdr)->mgm_time;
-	else /* INFO_EXT */
-		return ((struct x_phys_cpu *)hdr)->mgm_time;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
 }
 
 static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
 {
-	if (type == INFO_SIMPLE)
-		return ((struct phys_cpu *)hdr)->ctidx;
-	else /* INFO_EXT */
-		return ((struct x_phys_cpu *)hdr)->ctidx;
+	if (type == DIAG204_INFO_SIMPLE)
+		return ((struct diag204_phys_cpu *)hdr)->ctidx;
+	else /* DIAG204_INFO_EXT */
+		return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
 }
 
 /* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
-{
-	register unsigned long _subcode asm("0") = subcode;
-	register unsigned long _size asm("1") = size;
-
-	asm volatile(
-		"	diag	%2,%0,0x204\n"
-		"0:\n"
-		EX_TABLE(0b,0b)
-		: "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
-	if (_subcode)
-		return -1;
-	return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
-	diag_stat_inc(DIAG_STAT_X204);
-	return __diag204(subcode, size, addr);
-}
-
 /*
  * For the old diag subcode 4 with simple data format we have to use real
  * memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -409,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
 		*pages = diag204_buf_pages;
 		return diag204_buf;
 	}
-	if (fmt == INFO_SIMPLE) {
+	if (fmt == DIAG204_INFO_SIMPLE) {
 		*pages = 1;
 		return diag204_alloc_rbuf();
-	} else {/* INFO_EXT */
-		*pages = diag204((unsigned long)SUBC_RSI |
-				 (unsigned long)INFO_EXT, 0, NULL);
+	} else {/* DIAG204_INFO_EXT */
+		*pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+				 (unsigned long)DIAG204_INFO_EXT, 0, NULL);
 		if (*pages <= 0)
 			return ERR_PTR(-ENOSYS);
 		else
@@ -441,18 +293,18 @@ static int diag204_probe(void)
 	void *buf;
 	int pages, rc;
 
-	buf = diag204_get_buffer(INFO_EXT, &pages);
+	buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
 	if (!IS_ERR(buf)) {
-		if (diag204((unsigned long)SUBC_STIB7 |
-			    (unsigned long)INFO_EXT, pages, buf) >= 0) {
-			diag204_store_sc = SUBC_STIB7;
-			diag204_info_type = INFO_EXT;
+		if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+			    (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+			diag204_store_sc = DIAG204_SUBC_STIB7;
+			diag204_info_type = DIAG204_INFO_EXT;
 			goto out;
 		}
-		if (diag204((unsigned long)SUBC_STIB6 |
-			    (unsigned long)INFO_EXT, pages, buf) >= 0) {
-			diag204_store_sc = SUBC_STIB6;
-			diag204_info_type = INFO_EXT;
+		if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+			    (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+			diag204_store_sc = DIAG204_SUBC_STIB6;
+			diag204_info_type = DIAG204_INFO_EXT;
 			goto out;
 		}
 		diag204_free_buffer();
@@ -460,15 +312,15 @@ static int diag204_probe(void)
 
 	/* subcodes 6 and 7 failed, now try subcode 4 */
 
-	buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+	buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
 	if (IS_ERR(buf)) {
 		rc = PTR_ERR(buf);
 		goto fail_alloc;
 	}
-	if (diag204((unsigned long)SUBC_STIB4 |
-		    (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
-		diag204_store_sc = SUBC_STIB4;
-		diag204_info_type = INFO_SIMPLE;
+	if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+		    (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+		diag204_store_sc = DIAG204_SUBC_STIB4;
+		diag204_info_type = DIAG204_INFO_SIMPLE;
 		goto out;
 	} else {
 		rc = -ENOSYS;
@@ -543,9 +395,9 @@ static void diag224_delete_name_table(void)
 
 static int diag224_idx2name(int index, char *name)
 {
-	memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
-		CPU_NAME_LEN);
-	name[CPU_NAME_LEN] = 0;
+	memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+	       DIAG204_CPU_NAME_LEN);
+	name[DIAG204_CPU_NAME_LEN] = 0;
 	strim(name);
 	return 0;
 }
@@ -601,7 +453,7 @@ __init int hypfs_diag_init(void)
 		pr_err("The hardware system does not support hypfs\n");
 		return -ENODATA;
 	}
-	if (diag204_info_type == INFO_EXT) {
+	if (diag204_info_type == DIAG204_INFO_EXT) {
 		rc = hypfs_dbfs_create_file(&dbfs_file_d204);
 		if (rc)
 			return rc;
@@ -649,7 +501,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
 			      cpu_info__lp_time(diag204_info_type, cpu_info));
 	if (IS_ERR(rc))
 		return PTR_ERR(rc);
-	if (diag204_info_type == INFO_EXT) {
+	if (diag204_info_type == DIAG204_INFO_EXT) {
 		rc = hypfs_create_u64(cpu_dir, "onlinetime",
 				      cpu_info__online_time(diag204_info_type,
 							    cpu_info));
@@ -665,12 +517,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
 {
 	struct dentry *cpus_dir;
 	struct dentry *lpar_dir;
-	char lpar_name[LPAR_NAME_LEN + 1];
+	char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
 	void *cpu_info;
 	int i;
 
 	part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
-	lpar_name[LPAR_NAME_LEN] = 0;
+	lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
 	lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
 	if (IS_ERR(lpar_dir))
 		return lpar_dir;
@@ -753,7 +605,8 @@ int hypfs_diag_create_files(struct dentry *root)
 			goto err_out;
 		}
 	}
-	if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+	if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+	    DIAG204_LPAR_PHYS_FLG) {
 		ptr = hypfs_create_phys_files(root, part_hdr);
 		if (IS_ERR(ptr)) {
 			rc = PTR_ERR(ptr);
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 5fac921c1c4210..f72744f14e3113 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -78,4 +78,131 @@ struct diag210 {
 
 extern int diag210(struct diag210 *addr);
 
+/* bit is set in flags, when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8		/* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16		/* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+	DIAG204_SUBC_STIB4 = 4,
+	DIAG204_SUBC_RSI = 5,
+	DIAG204_SUBC_STIB6 = 6,
+	DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+	DIAG204_INFO_SIMPLE = 0,
+	DIAG204_INFO_EXT = 0x00010000
+};
+
+struct diag204_info_blk_hdr {
+	__u8  npar;
+	__u8  flags;
+	__u16 tslice;
+	__u16 phys_cpus;
+	__u16 this_part;
+	__u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+	__u8  npar;
+	__u8  flags;
+	__u16 tslice;
+	__u16 phys_cpus;
+	__u16 this_part;
+	__u64 curtod1;
+	__u64 curtod2;
+	char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+	__u8 pn;
+	__u8 cpus;
+	char reserved[6];
+	char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+	__u8  pn;
+	__u8  cpus;
+	__u8  rcpus;
+	__u8  pflag;
+	__u32 mlu;
+	char  part_name[DIAG204_LPAR_NAME_LEN];
+	char  lpc_name[8];
+	char  os_name[8];
+	__u64 online_cs;
+	__u64 online_es;
+	__u8  upid;
+	char  reserved1[3];
+	__u32 group_mlu;
+	char  group_name[8];
+	char  reserved2[32];
+} __packed;
+
+struct diag204_cpu_info {
+	__u16 cpu_addr;
+	char  reserved1[2];
+	__u8  ctidx;
+	__u8  cflag;
+	__u16 weight;
+	__u64 acc_time;
+	__u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+	__u16 cpu_addr;
+	char  reserved1[2];
+	__u8  ctidx;
+	__u8  cflag;
+	__u16 weight;
+	__u64 acc_time;
+	__u64 lp_time;
+	__u16 min_weight;
+	__u16 cur_weight;
+	__u16 max_weight;
+	char  reseved2[2];
+	__u64 online_time;
+	__u64 wait_time;
+	__u32 pma_weight;
+	__u32 polar_weight;
+	char  reserved3[40];
+} __packed;
+
+struct diag204_phys_hdr {
+	char reserved1[1];
+	__u8 cpus;
+	char reserved2[6];
+	char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+	char reserved1[1];
+	__u8 cpus;
+	char reserved2[6];
+	char mgm_name[8];
+	char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+	__u16 cpu_addr;
+	char  reserved1[2];
+	__u8  ctidx;
+	char  reserved2[3];
+	__u64 mgm_time;
+	char  reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+	__u16 cpu_addr;
+	char  reserved1[2];
+	__u8  ctidx;
+	char  reserved2[3];
+	__u64 mgm_time;
+	char  reserved3[80];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
 #endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 48b37b8357e683..f4ce4a248811e4 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -162,6 +162,28 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
 }
 EXPORT_SYMBOL(diag14);
 
+static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+	register unsigned long _subcode asm("0") = subcode;
+	register unsigned long _size asm("1") = size;
+
+	asm volatile(
+		"	diag	%2,%0,0x204\n"
+		"0:\n"
+		EX_TABLE(0b,0b)
+		: "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+	if (_subcode)
+		return -1;
+	return _size;
+}
+
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+	diag_stat_inc(DIAG_STAT_X204);
+	return __diag204(subcode, size, addr);
+}
+EXPORT_SYMBOL(diag204);
+
 /*
  * Diagnose 210: Get information about a virtual device
  */

From e435dc31398e63b992639cf62024d959219db191 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Mon, 8 Feb 2016 13:36:22 +0100
Subject: [PATCH 006/302] s390: Make cpc_name accessible

sclp_ocf.c is the only way to get the cpc name, as it registers the
sole event handler for the ocf event. By creating a new global
function that copies that name, we make it accessible to the world
which longs to retrieve it.

Additionally we now also store the cpc name as EBCDIC, so we don't
have to convert it to and from ASCII if it is requested in native
encoding.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h |  1 +
 drivers/s390/char/sclp_ocf.c | 23 +++++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index e4f6f73afe2f91..49736a0d4e0e9a 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -101,5 +101,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
 int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
 void sclp_early_detect(void);
 void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
 
 #endif /* _ASM_S390_SCLP_H */
diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c
index 2553db0fdb5285..f59b71776bbd9f 100644
--- a/drivers/s390/char/sclp_ocf.c
+++ b/drivers/s390/char/sclp_ocf.c
@@ -26,7 +26,7 @@
 #define OCF_LENGTH_CPC_NAME 8UL
 
 static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1];
-static char cpc_name[OCF_LENGTH_CPC_NAME + 1];
+static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */
 
 static DEFINE_SPINLOCK(sclp_ocf_lock);
 static struct work_struct sclp_ocf_change_work;
@@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf)
 	}
 	if (cpc) {
 		size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length);
+		memset(cpc_name, 0, OCF_LENGTH_CPC_NAME);
 		memcpy(cpc_name, cpc + 1, size);
-		EBCASC(cpc_name, size);
-		cpc_name[size] = 0;
 	}
 	spin_unlock(&sclp_ocf_lock);
 	schedule_work(&sclp_ocf_change_work);
@@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = {
 	.receiver_fn = sclp_ocf_handler,
 };
 
+void sclp_ocf_cpc_name_copy(char *dst)
+{
+	spin_lock_irq(&sclp_ocf_lock);
+	memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME);
+	spin_unlock_irq(&sclp_ocf_lock);
+}
+EXPORT_SYMBOL(sclp_ocf_cpc_name_copy);
+
 static ssize_t cpc_name_show(struct kobject *kobj,
 			     struct kobj_attribute *attr, char *page)
 {
-	int rc;
+	char name[OCF_LENGTH_CPC_NAME + 1];
 
-	spin_lock_irq(&sclp_ocf_lock);
-	rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name);
-	spin_unlock_irq(&sclp_ocf_lock);
-	return rc;
+	sclp_ocf_cpc_name_copy(name);
+	name[OCF_LENGTH_CPC_NAME] = 0;
+	EBCASC(name, OCF_LENGTH_CPC_NAME);
+	return snprintf(page, PAGE_SIZE, "%s\n", name);
 }
 
 static struct kobj_attribute cpc_name_attr =

From 022bd2d11cc51f62e873a09bcae8016b10950194 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Fri, 12 Feb 2016 12:52:49 +0100
Subject: [PATCH 007/302] s390: Make diag224 public

Diag204's cpu structures only contain the cpu type by means of an
index in the diag224 name table. Hence, to be able to use diag204 in
any meaningful way, we also need a usable diag224 interface.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/hypfs/hypfs_diag.c | 14 --------------
 arch/s390/include/asm/diag.h |  1 +
 arch/s390/kernel/diag.c      | 15 +++++++++++++++
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 1e28414d7275e7..28f03ca60100a3 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -360,20 +360,6 @@ static void *diag204_store(void)
 
 /* Diagnose 224 functions */
 
-static int diag224(void *ptr)
-{
-	int rc = -EOPNOTSUPP;
-
-	diag_stat_inc(DIAG_STAT_X224);
-	asm volatile(
-		"	diag	%1,%2,0x224\n"
-		"0:	lhi	%0,0x0\n"
-		"1:\n"
-		EX_TABLE(0b,1b)
-		: "+d" (rc) :"d" (0), "d" (ptr) : "memory");
-	return rc;
-}
-
 static int diag224_get_name_table(void)
 {
 	/* memory must be below 2GB */
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index f72744f14e3113..197e303a76e96c 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -205,4 +205,5 @@ struct diag204_x_phys_cpu {
 } __packed;
 
 int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
 #endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index f4ce4a248811e4..a44faf4a045442 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -218,3 +218,18 @@ int diag210(struct diag210 *addr)
 	return ccode;
 }
 EXPORT_SYMBOL(diag210);
+
+int diag224(void *ptr)
+{
+	int rc = -EOPNOTSUPP;
+
+	diag_stat_inc(DIAG_STAT_X224);
+	asm volatile(
+		"	diag	%1,%2,0x224\n"
+		"0:	lhi	%0,0x0\n"
+		"1:\n"
+		EX_TABLE(0b,1b)
+		: "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+	return rc;
+}
+EXPORT_SYMBOL(diag224);

From a011eeb2a3d6cd778eb63bea0bf149ebbe658ab5 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 14:14:01 +0200
Subject: [PATCH 008/302] KVM: s390: Add operation exception interception
 handler

This commit introduces code that handles operation exception
interceptions. With this handler we can emulate instructions by using
illegal opcodes.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  1 +
 arch/s390/kvm/intercept.c        | 11 +++++++++++
 arch/s390/kvm/kvm-s390.c         |  1 +
 arch/s390/kvm/trace.h            | 21 +++++++++++++++++++++
 4 files changed, 34 insertions(+)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 37b9017c6a96b7..093ea14109e2a1 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -255,6 +255,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stctg;
 	u32 exit_program_interruption;
 	u32 exit_instr_and_program;
+	u32 exit_operation_exception;
 	u32 deliver_external_call;
 	u32 deliver_emergency_signal;
 	u32 deliver_service_signal;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 2e6b54e4d3f955..09c13db1416fdc 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -349,6 +349,15 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
 	return -EOPNOTSUPP;
 }
 
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+	vcpu->stat.exit_operation_exception++;
+	trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+				      vcpu->arch.sie_block->ipb);
+
+	return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
 	if (kvm_is_ucontrol(vcpu->kvm))
@@ -370,6 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 		return handle_validity(vcpu);
 	case 0x28:
 		return handle_stop(vcpu);
+	case 0x2c:
+		return handle_operexc(vcpu);
 	case 0x38:
 		return handle_partial_execution(vcpu);
 	default:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6d8ec3ac9dd8ec..f0addece729e88 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -63,6 +63,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "exit_instruction", VCPU_STAT(exit_instruction) },
 	{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
 	{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+	{ "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
 	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 916834d7a73a76..90d26a6aa52c09 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -412,6 +412,27 @@ TRACE_EVENT(kvm_s390_handle_stsi,
 			   __entry->addr)
 	);
 
+TRACE_EVENT(kvm_s390_handle_operexc,
+	    TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+	    TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+	    TP_STRUCT__entry(
+		    VCPU_FIELD_COMMON
+		    __field(__u64, instruction)
+		    ),
+
+	    TP_fast_assign(
+		    VCPU_ASSIGN_COMMON
+		    __entry->instruction = ((__u64)ipa << 48) |
+		    ((__u64)ipb << 16);
+		    ),
+
+	    VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+			   __entry->instruction,
+			   __print_symbolic(icpt_insn_decoder(__entry->instruction),
+					    icpt_insn_codes))
+	);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */

From a2d57b35c0226102b1f2ffdc2f719fcc30c99bf5 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Mon, 23 May 2016 15:09:19 +0200
Subject: [PATCH 009/302] KVM: s390: Extend diag 204 fields

The new store hypervisor information instruction, which we are going
to introduce, needs previously unused fields in diag 204 structures.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/diag.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 197e303a76e96c..f4000cdb692116 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -97,6 +97,11 @@ enum diag204_format {
 	DIAG204_INFO_EXT = 0x00010000
 };
 
+enum diag204_cpu_flags {
+	DIAG204_CPU_ONLINE = 0x20,
+	DIAG204_CPU_CAPPED = 0x40,
+};
+
 struct diag204_info_blk_hdr {
 	__u8  npar;
 	__u8  flags;
@@ -136,10 +141,13 @@ struct diag204_x_part_hdr {
 	__u64 online_cs;
 	__u64 online_es;
 	__u8  upid;
-	char  reserved1[3];
+	__u8  reserved:3;
+	__u8  mtid:5;
+	char  reserved1[2];
 	__u32 group_mlu;
 	char  group_name[8];
-	char  reserved2[32];
+	char  hardware_group_name[8];
+	char  reserved2[24];
 } __packed;
 
 struct diag204_cpu_info {
@@ -168,7 +176,9 @@ struct diag204_x_cpu_info {
 	__u64 wait_time;
 	__u32 pma_weight;
 	__u32 polar_weight;
-	char  reserved3[40];
+	__u32 cpu_type_cap;
+	__u32 group_cpu_type_cap;
+	char  reserved3[32];
 } __packed;
 
 struct diag204_phys_hdr {
@@ -199,7 +209,8 @@ struct diag204_x_phys_cpu {
 	__u16 cpu_addr;
 	char  reserved1[2];
 	__u8  ctidx;
-	char  reserved2[3];
+	char  reserved2[1];
+	__u16 weight;
 	__u64 mgm_time;
 	char  reserved3[80];
 } __packed;

From 95ca2cb57985b07f5b136405f80a5106f5b06641 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Mon, 23 May 2016 15:11:58 +0200
Subject: [PATCH 010/302] KVM: s390: Add sthyi emulation

Store Hypervisor Information is an emulated z/VM instruction that
provides a guest with basic information about the layers it is running
on. This includes information about the cpu configuration of both the
machine and the lpar, as well as their names, machine model and
machine type. This information enables an application to determine the
maximum capacity of CPs and IFLs available to software.

The instruction is available whenever the facility bit 74 is set,
otherwise executing it results in an operation exception.

It is important to check the validity flags in the sections before
using data from any structure member. It is not guaranteed that all
members will be valid on all machines / machine configurations.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/diag.h     |  10 +
 arch/s390/include/asm/kvm_host.h |   2 +
 arch/s390/include/uapi/asm/sie.h |   1 +
 arch/s390/kvm/Makefile           |   2 +-
 arch/s390/kvm/intercept.c        |   4 +
 arch/s390/kvm/kvm-s390.c         |   6 +
 arch/s390/kvm/kvm-s390.h         |   3 +
 arch/s390/kvm/sthyi.c            | 460 +++++++++++++++++++++++++++++++
 arch/s390/kvm/trace.h            |  20 ++
 9 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/kvm/sthyi.c

diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index f4000cdb692116..82211998ccf740 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -215,6 +215,16 @@ struct diag204_x_phys_cpu {
 	char  reserved3[80];
 } __packed;
 
+struct diag204_x_part_block {
+	struct diag204_x_part_hdr hdr;
+	struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+	struct diag204_x_phys_hdr hdr;
+	struct diag204_x_phys_cpu cpus[];
+} __packed;
+
 int diag204(unsigned long subcode, unsigned long size, void *addr);
 int diag224(void *ptr);
 #endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 093ea14109e2a1..7233b1c499646f 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -154,6 +154,7 @@ struct kvm_s390_sie_block {
 #define LCTL_CR14	0x0002
 	__u16   lctl;			/* 0x0044 */
 	__s16	icpua;			/* 0x0046 */
+#define ICTL_OPEREXC	0x80000000
 #define ICTL_PINT	0x20000000
 #define ICTL_LPSW	0x00400000
 #define ICTL_STCTL	0x00040000
@@ -279,6 +280,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stfl;
 	u32 instruction_tprot;
 	u32 instruction_essa;
+	u32 instruction_sthyi;
 	u32 instruction_sigp_sense;
 	u32 instruction_sigp_sense_running;
 	u32 instruction_sigp_external_call;
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 8fb5d4a6dd25bc..3ac6343689394d 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -140,6 +140,7 @@
 	exit_code_ipa0(0xB2, 0x4c, "TAR"),	\
 	exit_code_ipa0(0xB2, 0x50, "CSP"),	\
 	exit_code_ipa0(0xB2, 0x54, "MVPG"),	\
+	exit_code_ipa0(0xB2, 0x56, "STHYI"),	\
 	exit_code_ipa0(0xB2, 0x58, "BSG"),	\
 	exit_code_ipa0(0xB2, 0x5a, "BSA"),	\
 	exit_code_ipa0(0xB2, 0x5f, "CHSC"),	\
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index d42fa38c242921..82e73e2b953d17 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o
 
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 09c13db1416fdc..9359f65c8634ba 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -355,6 +355,10 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 	trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
 				      vcpu->arch.sie_block->ipb);
 
+	if (vcpu->arch.sie_block->ipa == 0xb256 &&
+	    test_kvm_facility(vcpu->kvm, 74))
+		return handle_sthyi(vcpu);
+
 	return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f0addece729e88..1c10254119b331 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -94,6 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
 	{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
 	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -1189,6 +1190,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 
+	set_kvm_facility(kvm->arch.model.fac_mask, 74);
+	set_kvm_facility(kvm->arch.model.fac_list, 74);
+
 	kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
 	kvm->arch.model.ibc = sclp.ibc & 0x0fff;
 
@@ -1679,6 +1683,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
 	vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+	if (test_kvm_facility(vcpu->kvm, 74))
+		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
 
 	if (vcpu->kvm->arch.use_cmma) {
 		rc = kvm_s390_vcpu_setup_cmma(vcpu);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8621ab00ec8e19..c5ec4d31e5e309 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -250,6 +250,9 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
 /* implemented in kvm-s390.c */
 void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644
index 00000000000000..894d5626f18d58
--- /dev/null
+++ b/arch/s390/kvm/sthyi.c
@@ -0,0 +1,460 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP  0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+	HDR_NOT_LPAR   = 0x10,
+	HDR_STACK_INCM = 0x20,
+	HDR_STSI_UNAV  = 0x40,
+	HDR_PERF_UNAV  = 0x80,
+};
+
+enum mac_validity {
+	MAC_NAME_VLD = 0x20,
+	MAC_ID_VLD   = 0x40,
+	MAC_CNT_VLD  = 0x80,
+};
+
+enum par_flag {
+	PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+	PAR_GRP_VLD  = 0x08,
+	PAR_ID_VLD   = 0x10,
+	PAR_ABS_VLD  = 0x20,
+	PAR_WGHT_VLD = 0x40,
+	PAR_PCNT_VLD  = 0x80,
+};
+
+struct hdr_sctn {
+	u8 infhflg1;
+	u8 infhflg2; /* reserved */
+	u8 infhval1; /* reserved */
+	u8 infhval2; /* reserved */
+	u8 reserved[3];
+	u8 infhygct;
+	u16 infhtotl;
+	u16 infhdln;
+	u16 infmoff;
+	u16 infmlen;
+	u16 infpoff;
+	u16 infplen;
+	u16 infhoff1;
+	u16 infhlen1;
+	u16 infgoff1;
+	u16 infglen1;
+	u16 infhoff2;
+	u16 infhlen2;
+	u16 infgoff2;
+	u16 infglen2;
+	u16 infhoff3;
+	u16 infhlen3;
+	u16 infgoff3;
+	u16 infglen3;
+	u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+	u8 infmflg1; /* reserved */
+	u8 infmflg2; /* reserved */
+	u8 infmval1;
+	u8 infmval2; /* reserved */
+	u16 infmscps;
+	u16 infmdcps;
+	u16 infmsifl;
+	u16 infmdifl;
+	char infmname[8];
+	char infmtype[4];
+	char infmmanu[16];
+	char infmseq[16];
+	char infmpman[4];
+	u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+	u8 infpflg1;
+	u8 infpflg2; /* reserved */
+	u8 infpval1;
+	u8 infpval2; /* reserved */
+	u16 infppnum;
+	u16 infpscps;
+	u16 infpdcps;
+	u16 infpsifl;
+	u16 infpdifl;
+	u16 reserved;
+	char infppnam[8];
+	u32 infpwbcp;
+	u32 infpabcp;
+	u32 infpwbif;
+	u32 infpabif;
+	char infplgnm[8];
+	u32 infplgcp;
+	u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+	struct hdr_sctn hdr;
+	struct mac_sctn mac;
+	struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+	u64 lpar_cap;
+	u64 lpar_grp_cap;
+	u64 lpar_weight;
+	u64 all_weight;
+	int cpu_num_ded;
+	int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+	struct cpu_inf cp;
+	struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+	return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+	return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+	sctns->hdr.infhdln = sizeof(sctns->hdr);
+	sctns->hdr.infmoff = sizeof(sctns->hdr);
+	sctns->hdr.infmlen = sizeof(sctns->mac);
+	sctns->hdr.infplen = sizeof(sctns->par);
+	sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+	sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+			  struct sysinfo_1_1_1 *sysinfo)
+{
+	if (stsi(sysinfo, 1, 1, 1))
+		return;
+
+	sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+	memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+	memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+	memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+	memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+	sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+			  struct sysinfo_2_2_2 *sysinfo)
+{
+	if (stsi(sysinfo, 2, 2, 2))
+		return;
+
+	sctns->par.infppnum = sysinfo->lpar_number;
+	memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+	sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+	void *sysinfo;
+
+	/* Errors are handled through the validity bits in the response. */
+	sysinfo = (void *)__get_free_page(GFP_KERNEL);
+	if (!sysinfo)
+		return;
+
+	fill_stsi_mac(sctns, sysinfo);
+	fill_stsi_par(sctns, sysinfo);
+
+	free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+			  struct diag204_x_phys_block *block,
+			  void *diag224_buf)
+{
+	int i;
+
+	for (i = 0; i < block->hdr.cpus; i++) {
+		switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+		case CP:
+			if (block->cpus[i].weight == DED_WEIGHT)
+				sctns->mac.infmdcps++;
+			else
+				sctns->mac.infmscps++;
+			break;
+		case IFL:
+			if (block->cpus[i].weight == DED_WEIGHT)
+				sctns->mac.infmdifl++;
+			else
+				sctns->mac.infmsifl++;
+			break;
+		}
+	}
+	sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+						 bool this_lpar,
+						 void *diag224_buf,
+						 struct diag204_x_part_block *block)
+{
+	int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+	struct cpu_inf *cpu_inf;
+
+	for (i = 0; i < block->hdr.rcpus; i++) {
+		if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+			continue;
+
+		switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+		case CP:
+			cpu_inf = &part_inf->cp;
+			if (block->cpus[i].cur_weight < DED_WEIGHT)
+				weight_cp |= block->cpus[i].cur_weight;
+			break;
+		case IFL:
+			cpu_inf = &part_inf->ifl;
+			if (block->cpus[i].cur_weight < DED_WEIGHT)
+				weight_ifl |= block->cpus[i].cur_weight;
+			break;
+		default:
+			continue;
+		}
+
+		if (!this_lpar)
+			continue;
+
+		capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+		cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+		cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+		if (block->cpus[i].weight == DED_WEIGHT)
+			cpu_inf->cpu_num_ded += 1;
+		else
+			cpu_inf->cpu_num_shd += 1;
+	}
+
+	if (this_lpar && capped) {
+		part_inf->cp.lpar_weight = weight_cp;
+		part_inf->ifl.lpar_weight = weight_ifl;
+	}
+	part_inf->cp.all_weight += weight_cp;
+	part_inf->ifl.all_weight += weight_ifl;
+	return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+	int i, r, pages;
+	bool this_lpar;
+	void *diag204_buf;
+	void *diag224_buf = NULL;
+	struct diag204_x_info_blk_hdr *ti_hdr;
+	struct diag204_x_part_block *part_block;
+	struct diag204_x_phys_block *phys_block;
+	struct lpar_cpu_inf lpar_inf = {};
+
+	/* Errors are handled through the validity bits in the response. */
+	pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+			(unsigned long)DIAG204_INFO_EXT, 0, NULL);
+	if (pages <= 0)
+		return;
+
+	diag204_buf = vmalloc(PAGE_SIZE * pages);
+	if (!diag204_buf)
+		return;
+
+	r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+		    (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+	if (r < 0)
+		goto out;
+
+	diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+	if (!diag224_buf || diag224(diag224_buf))
+		goto out;
+
+	ti_hdr = diag204_buf;
+	part_block = diag204_buf + sizeof(*ti_hdr);
+
+	for (i = 0; i < ti_hdr->npar; i++) {
+		/*
+		 * For the calling lpar we also need to get the cpu
+		 * caps and weights. The time information block header
+		 * specifies the offset to the partition block of the
+		 * caller lpar, so we know when we process its data.
+		 */
+		this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+		part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+					  part_block);
+	}
+
+	phys_block = (struct diag204_x_phys_block *)part_block;
+	part_block = diag204_buf + ti_hdr->this_part;
+	if (part_block->hdr.mtid)
+		sctns->par.infpflg1 = PAR_MT_EN;
+
+	sctns->par.infpval1 |= PAR_GRP_VLD;
+	sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+	sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+	memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+	       sizeof(sctns->par.infplgnm));
+
+	sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+	sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+	sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+	sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+	sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+	sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+	sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+	sctns->par.infpval1 |= PAR_ABS_VLD;
+
+	/*
+	 * Everything below needs global performance data to be
+	 * meaningful.
+	 */
+	if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+		sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+		goto out;
+	}
+
+	fill_diag_mac(sctns, phys_block, diag224_buf);
+
+	if (lpar_inf.cp.lpar_weight) {
+		sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+			lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+	}
+
+	if (lpar_inf.ifl.lpar_weight) {
+		sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+			lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+	}
+	sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+	kfree(diag224_buf);
+	vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+	register u64 code asm("0") = 0;
+	register u64 addr asm("2") = vaddr;
+	int cc;
+
+	asm volatile(
+		".insn   rre,0xB2560000,%[code],%[addr]\n"
+		"ipm     %[cc]\n"
+		"srl     %[cc],28\n"
+		: [cc] "=d" (cc)
+		: [code] "d" (code), [addr] "a" (addr)
+		: "memory", "cc");
+	return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+	int reg1, reg2, r = 0;
+	u64 code, addr, cc = 0;
+	struct sthyi_sctns *sctns = NULL;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+	code = vcpu->run->s.regs.gprs[reg1];
+	addr = vcpu->run->s.regs.gprs[reg2];
+
+	vcpu->stat.instruction_sthyi++;
+	VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+	trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+	if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	if (code & 0xffff) {
+		cc = 3;
+		goto out;
+	}
+
+	/*
+	 * If the page has not yet been faulted in, we want to do that
+	 * now and not after all the expensive calculations.
+	 */
+	r = write_guest(vcpu, addr, reg2, &cc, 1);
+	if (r)
+		return kvm_s390_inject_prog_cond(vcpu, r);
+
+	sctns = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!sctns)
+		return -ENOMEM;
+
+	/*
+	 * If we are a guest, we don't want to emulate an emulated
+	 * instruction. We ask the hypervisor to provide the data.
+	 */
+	if (test_facility(74)) {
+		cc = sthyi((u64)sctns);
+		goto out;
+	}
+
+	fill_hdr(sctns);
+	fill_stsi(sctns);
+	fill_diag(sctns);
+
+out:
+	if (!cc) {
+		r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+		if (r) {
+			free_page((unsigned long)sctns);
+			return kvm_s390_inject_prog_cond(vcpu, r);
+		}
+	}
+
+	free_page((unsigned long)sctns);
+	vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+	kvm_s390_set_psw_cc(vcpu, cc);
+	return r;
+}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 90d26a6aa52c09..a429ef9b0d3041 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -433,6 +433,26 @@ TRACE_EVENT(kvm_s390_handle_operexc,
 					    icpt_insn_codes))
 	);
 
+TRACE_EVENT(kvm_s390_handle_sthyi,
+	    TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+	    TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+	    TP_STRUCT__entry(
+		    VCPU_FIELD_COMMON
+		    __field(u64, code)
+		    __field(u64, addr)
+		    ),
+
+	    TP_fast_assign(
+		    VCPU_ASSIGN_COMMON
+		    __entry->code = code;
+		    __entry->addr = addr;
+		    ),
+
+	    VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+			   __entry->code, __entry->addr)
+	);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */

From 7d0a5e62411a9223512c6af2e4c08a2d7c00fa2e Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Tue, 10 May 2016 15:03:42 +0200
Subject: [PATCH 011/302] KVM: s390: Limit sthyi execution

Store hypervisor information is a valid instruction not only in
supervisor state but also in problem state, i.e. the guest's
userspace. Its execution is not only computationally and memory
intensive, but also has to get hold of the ipte lock to write to the
guest's memory.

This lock is not intended to be held often and long, especially not
from the untrusted guest userspace. Therefore we apply rate limiting
of sthyi executions per VM.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  1 +
 arch/s390/kvm/kvm-s390.c         |  2 ++
 arch/s390/kvm/sthyi.c            | 11 +++++++++++
 3 files changed, 14 insertions(+)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 7233b1c499646f..bcc20dc91ea8aa 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -652,6 +652,7 @@ struct kvm_arch{
 	wait_queue_head_t ipte_wq;
 	int ipte_lock_count;
 	struct mutex ipte_mutex;
+	struct ratelimit_state sthyi_limit;
 	spinlock_t start_stop_lock;
 	struct sie_page2 *sie_page2;
 	struct kvm_s390_cpu_model model;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1c10254119b331..44297ff53b4431 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1151,6 +1151,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	rc = -ENOMEM;
 
+	ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
 	kvm->arch.use_esca = 0; /* start with basic SCA */
 	rwlock_init(&kvm->arch.sca_lock);
 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index 894d5626f18d58..bd98b7d252004d 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
 
 #include <asm/kvm_host.h>
 #include <asm/asm-offsets.h>
@@ -403,6 +404,16 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
 	u64 code, addr, cc = 0;
 	struct sthyi_sctns *sctns = NULL;
 
+	/*
+	 * STHYI requires extensive locking in the higher hypervisors
+	 * and is very computational/memory expensive. Therefore we
+	 * ratelimit the executions per VM.
+	 */
+	if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+		kvm_s390_retry_instr(vcpu);
+		return 0;
+	}
+
 	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
 	code = vcpu->run->s.regs.gprs[reg1];
 	addr = vcpu->run->s.regs.gprs[reg2];

From c1778e515712dae0575657fe6c9511ffcb28a7e0 Mon Sep 17 00:00:00 2001
From: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Date: Fri, 6 May 2016 15:47:19 +0300
Subject: [PATCH 012/302] KVM: s390: Add mnemonic print to
 kvm_s390_intercept_prog

We have a table of mnemonic names for intercepted program
interruptions, let's print readable name of the interruption in the
kvm_s390_intercept_prog trace event.

Signed-off-by: Alexander Yarygin <yarygin@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/trace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index a429ef9b0d3041..1c4586b367a443 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
 		    __entry->code = code;
 		    ),
 
-	    VCPU_TP_PRINTK("intercepted program interruption %04x",
-			   __entry->code)
+	    VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+			   __entry->code,
+			   __print_symbolic(__entry->code,
+					    icpt_prog_codes))
 	);
 
 /*

From 15c9705f0c8af2d19dede9866aec364746b269ef Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 19 Mar 2015 17:36:43 +0100
Subject: [PATCH 013/302] KVM: s390: interface to query and configure cpu
 features

For now, we only have an interface to query and configure facilities
indicated via STFL(E). However, we also have features indicated via
SCLP, that have to be indicated to the guest by user space and usually
require KVM support.

This patch allows user space to query and configure available cpu features
for the guest.

Please note that disabling a feature doesn't necessarily mean that it is
completely disabled (e.g. ESOP is mostly handled by the SIE). We will try
our best to disable it.

Most features (e.g. SCLP) can't directly be forwarded, as most of them need
in addition to hardware support, support in KVM. As we later on want to
turn these features in KVM explicitly on/off (to simulate different
behavior), we have to filter all features provided by the hardware and
make them configurable.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/vm.txt | 27 ++++++++++
 arch/s390/include/asm/kvm_host.h         |  2 +
 arch/s390/include/uapi/asm/kvm.h         |  8 +++
 arch/s390/kvm/kvm-s390.c                 | 63 ++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.h                 |  6 +++
 5 files changed, 106 insertions(+)

diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index a9ea8774a45feb..0ed6808b9965d1 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -85,6 +85,33 @@ Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write
 	    -ENOMEM if not enough memory is available to process the ioctl
 	    0 in case of success
 
+2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o)
+
+Allows user space to retrieve available cpu features. A feature is available if
+provided by the hardware and supported by kvm. In theory, cpu features could
+even be completely emulated by kvm.
+
+struct kvm_s390_vm_cpu_feat {
+        __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering
+};
+
+Parameters: address of a buffer to load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+	    0 in case of success.
+
+2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w)
+
+Allows user space to retrieve or change enabled cpu features for all VCPUs of a
+VM. Features that are not available cannot be enabled.
+
+See 2.3. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the feature list from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+	    -EINVAL if a cpu feature that is not available is to be enabled.
+	    -EBUSY if at least one VCPU has already been defined.
+	    0 in case of success.
+
 3. GROUP: KVM_S390_VM_TOD
 Architectures: s390
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index bcc20dc91ea8aa..b2a83a0ce42cc6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -658,6 +658,8 @@ struct kvm_arch{
 	struct kvm_s390_cpu_model model;
 	struct kvm_s390_crypto crypto;
 	u64 epoch;
+	/* subset of available cpu features enabled by user space */
+	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3b8e99ef9d58d4..a8559f265e260a 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -93,6 +93,14 @@ struct kvm_s390_vm_cpu_machine {
 	__u64 fac_list[256];
 };
 
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT	2
+#define KVM_S390_VM_CPU_MACHINE_FEAT	3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS	1024
+struct kvm_s390_vm_cpu_feat {
+	__u64 feat[16];
+};
+
 /* kvm attributes for crypto */
 #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW	0
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW	1
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 44297ff53b4431..6960468f28ad36 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/bitmap.h>
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/etr.h>
@@ -132,6 +133,9 @@ unsigned long kvm_s390_fac_list_mask_size(void)
 	return ARRAY_SIZE(kvm_s390_fac_list_mask);
 }
 
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+
 static struct gmap_notifier gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 
@@ -677,6 +681,29 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 	return ret;
 }
 
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+				       struct kvm_device_attr *attr)
+{
+	struct kvm_s390_vm_cpu_feat data;
+	int ret = -EBUSY;
+
+	if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+		return -EFAULT;
+	if (!bitmap_subset((unsigned long *) data.feat,
+			   kvm_s390_available_cpu_feat,
+			   KVM_S390_VM_CPU_FEAT_NR_BITS))
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+	if (!atomic_read(&kvm->online_vcpus)) {
+		bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+			    KVM_S390_VM_CPU_FEAT_NR_BITS);
+		ret = 0;
+	}
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
 static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret = -ENXIO;
@@ -685,6 +712,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CPU_PROCESSOR:
 		ret = kvm_s390_set_processor(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+		ret = kvm_s390_set_processor_feat(kvm, attr);
+		break;
 	}
 	return ret;
 }
@@ -733,6 +763,31 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
 	return ret;
 }
 
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+				       struct kvm_device_attr *attr)
+{
+	struct kvm_s390_vm_cpu_feat data;
+
+	bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+		    KVM_S390_VM_CPU_FEAT_NR_BITS);
+	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+		return -EFAULT;
+	return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+				     struct kvm_device_attr *attr)
+{
+	struct kvm_s390_vm_cpu_feat data;
+
+	bitmap_copy((unsigned long *) data.feat,
+		    kvm_s390_available_cpu_feat,
+		    KVM_S390_VM_CPU_FEAT_NR_BITS);
+	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+		return -EFAULT;
+	return 0;
+}
+
 static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret = -ENXIO;
@@ -744,6 +799,12 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CPU_MACHINE:
 		ret = kvm_s390_get_machine(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+		ret = kvm_s390_get_processor_feat(kvm, attr);
+		break;
+	case KVM_S390_VM_CPU_MACHINE_FEAT:
+		ret = kvm_s390_get_machine_feat(kvm, attr);
+		break;
 	}
 	return ret;
 }
@@ -827,6 +888,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 		switch (attr->attr) {
 		case KVM_S390_VM_CPU_PROCESSOR:
 		case KVM_S390_VM_CPU_MACHINE:
+		case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+		case KVM_S390_VM_CPU_MACHINE_FEAT:
 			ret = 0;
 			break;
 		default:
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index c5ec4d31e5e309..52aa47e112d801 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
 	return 0;
 }
 
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+	WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+	return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
 /* are cpu states controlled by user space */
 static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 {

From 22be5a133169e855097936438417ab1b672ad43f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 21 Jan 2016 13:22:54 +0100
Subject: [PATCH 014/302] KVM: s390: forward ESOP if available

ESOP guarantees that during a protection exception, bit 61 of real location
168-175 will only be set to 1 if it was because of ALCP or DATP. If the
exception is due to LAP or KCP, the bit will always be set to 0.

The old SOP definition allowed bit 61 to be unpredictable in case of LAP
or KCP in some conditions. So ESOP replaces this unpredictability by
a guarantee.

Therefore, we can directly forward ESOP if it is available on our machine.
We don't have to do anything when ESOP is disabled - the guest will simply
expect unpredictable values. Our guest access functions are already
handling ESOP properly.

Please note that future functionality in KVM will require knowledge about
ESOP being enabled for a guest or not.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h |  1 +
 arch/s390/kvm/kvm-s390.c         | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index a8559f265e260a..789c4e27e2941a 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -97,6 +97,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_MACHINE_FEAT	3
 
 #define KVM_S390_VM_CPU_FEAT_NR_BITS	1024
+#define KVM_S390_VM_CPU_FEAT_ESOP	0
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6960468f28ad36..2b5c14da32273a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -193,6 +193,17 @@ void kvm_arch_hardware_unsetup(void)
 					 &kvm_clock_notifier);
 }
 
+static void allow_cpu_feat(unsigned long nr)
+{
+	set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+	if (MACHINE_HAS_ESOP)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+}
+
 int kvm_arch_init(void *opaque)
 {
 	kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -204,6 +215,8 @@ int kvm_arch_init(void *opaque)
 		return -ENOMEM;
 	}
 
+	kvm_s390_cpu_feat_init();
+
 	/* Register floating interrupt controller interface. */
 	return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
 }

From 6167375b558196fdedd38e9867f7bb30ff4dda50 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 19:44:10 +0200
Subject: [PATCH 015/302] KVM: s390: gaccess: store guest address on ALC prot
 exceptions

Let's pass the effective guest address to get_vcpu_asce(), so we
can properly set the guest address in case we inject an ALC protection
exception.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 66938d283b7709..c0da9e9d4490fe 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -477,7 +477,7 @@ enum {
 };
 
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-			 ar_t ar, enum gacc_mode mode)
+			 unsigned long ga, ar_t ar, enum gacc_mode mode)
 {
 	int rc;
 	struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
@@ -519,6 +519,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
 			vcpu->arch.pgm.exc_access_id = ar;
 			break;
 		case PGM_PROTECTION:
+			tec_bits->addr = ga >> PAGE_SHIFT;
 			tec_bits->b60 = 1;
 			tec_bits->b61 = 1;
 			break;
@@ -783,7 +784,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 
 	if (!len)
 		return 0;
-	rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+	ga = kvm_s390_logical_to_effective(vcpu, ga);
+	rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
 	if (rc)
 		return rc;
 	nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -854,7 +856,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 
 	gva = kvm_s390_logical_to_effective(vcpu, gva);
 	tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-	rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+	rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
 	tec->addr = gva >> PAGE_SHIFT;
 	if (rc)
 		return rc;

From d03193de30e6d99770930c6fbf14f0d5dd5cb2f0 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 19:56:46 +0200
Subject: [PATCH 016/302] KVM: s390: gaccess: function for preparing
 translation exceptions

Let's provide a function trans_exc() that can be used for handling
preparation of translation exceptions on a central basis. We will use
that function to replace existing code in gaccess.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 62 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index c0da9e9d4490fe..b6ccb26bc3c1c8 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -476,6 +476,68 @@ enum {
 	FSI_FETCH   = 2  /* Exception was due to fetch operation */
 };
 
+enum prot_type {
+	PROT_TYPE_LA   = 0,
+	PROT_TYPE_KEYC = 1,
+	PROT_TYPE_ALC  = 2,
+	PROT_TYPE_DAT  = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+		     ar_t ar, enum gacc_mode mode, enum prot_type prot)
+{
+	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+	struct trans_exc_code_bits *tec;
+
+	memset(pgm, 0, sizeof(*pgm));
+	pgm->code = code;
+	tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+	switch (code) {
+	case PGM_ASCE_TYPE:
+	case PGM_PAGE_TRANSLATION:
+	case PGM_REGION_FIRST_TRANS:
+	case PGM_REGION_SECOND_TRANS:
+	case PGM_REGION_THIRD_TRANS:
+	case PGM_SEGMENT_TRANSLATION:
+		/*
+		 * op_access_id only applies to MOVE_PAGE -> set bit 61
+		 * exc_access_id has to be set to 0 for some instructions. Both
+		 * cases have to be handled by the caller. We can always store
+		 * exc_access_id, as it is undefined for non-ar cases.
+		 */
+		tec->addr = gva >> PAGE_SHIFT;
+		tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+		tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+		/* FALL THROUGH */
+	case PGM_ALEN_TRANSLATION:
+	case PGM_ALE_SEQUENCE:
+	case PGM_ASTE_VALIDITY:
+	case PGM_ASTE_SEQUENCE:
+	case PGM_EXTENDED_AUTHORITY:
+		pgm->exc_access_id = ar;
+		break;
+	case PGM_PROTECTION:
+		switch (prot) {
+		case PROT_TYPE_ALC:
+			tec->b60 = 1;
+			/* FALL THROUGH */
+		case PROT_TYPE_DAT:
+			tec->b61 = 1;
+			tec->addr = gva >> PAGE_SHIFT;
+			tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+			tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+			/* exc_access_id is undefined for most cases */
+			pgm->exc_access_id = ar;
+			break;
+		default: /* LA and KEYC set b61 to 0, other params undefined */
+			break;
+		}
+		break;
+	}
+	return code;
+}
+
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
 			 unsigned long ga, ar_t ar, enum gacc_mode mode)
 {

From 3e3c67f6a327852375247c98b0d153c44e460216 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 20:00:33 +0200
Subject: [PATCH 017/302] KVM: s390: gaccess: convert
 kvm_s390_check_low_addr_prot_real()

Let's use our new function for preparing translation exceptions.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index b6ccb26bc3c1c8..61dc45ef50b9d0 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -979,20 +979,9 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
  */
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 {
-	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-	psw_t *psw = &vcpu->arch.sie_block->gpsw;
-	struct trans_exc_code_bits *tec_bits;
 	union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
 
 	if (!ctlreg0.lap || !is_low_address(gra))
 		return 0;
-
-	memset(pgm, 0, sizeof(*pgm));
-	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-	tec_bits->fsi = FSI_STORE;
-	tec_bits->as = psw_bits(*psw).as;
-	tec_bits->addr = gra >> PAGE_SHIFT;
-	pgm->code = PGM_PROTECTION;
-
-	return pgm->code;
+	return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
 }

From fbcb7d5157718645cc198c6be6b435ab326c1892 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 20:06:55 +0200
Subject: [PATCH 018/302] KVM: s390: gaccess: convert guest_translate_address()

Let's use our new function for preparing translation exceptions.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 61dc45ef50b9d0..ae9f9e8e063cdb 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -910,37 +910,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 			    unsigned long *gpa, enum gacc_mode mode)
 {
-	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
-	struct trans_exc_code_bits *tec;
 	union asce asce;
 	int rc;
 
 	gva = kvm_s390_logical_to_effective(vcpu, gva);
-	tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
 	rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
-	tec->addr = gva >> PAGE_SHIFT;
 	if (rc)
 		return rc;
 	if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
-		if (mode == GACC_STORE) {
-			rc = pgm->code = PGM_PROTECTION;
-			return rc;
-		}
+		if (mode == GACC_STORE)
+			return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+					 mode, PROT_TYPE_LA);
 	}
 
 	if (psw_bits(*psw).t && !asce.r) {	/* Use DAT? */
 		rc = guest_translate(vcpu, gva, gpa, asce, mode);
-		if (rc > 0) {
-			if (rc == PGM_PROTECTION)
-				tec->b61 = 1;
-			pgm->code = rc;
-		}
+		if (rc > 0)
+			return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
 	} else {
-		rc = 0;
 		*gpa = kvm_s390_real_to_abs(vcpu, gva);
 		if (kvm_is_error_gpa(vcpu->kvm, *gpa))
-			rc = pgm->code = PGM_ADDRESSING;
+			return trans_exc(vcpu, PGM_ADDRESSING, gva, 0, mode, 0);
 	}
 
 	return rc;

From cde0dcfb5df1dbcd90a8e73130a6b7091bdb493a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 20:13:35 +0200
Subject: [PATCH 019/302] KVM: s390: gaccess: convert guest_page_range()

Let's use our new function for preparing translation exceptions. As we will
need the correct ar, let's pass that to guest_page_range().

This will also make sure that the guest address is stored in the tec
for applicable exceptions.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index ae9f9e8e063cdb..ec6c91e85dbe92 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -792,40 +792,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
 			    unsigned long *pages, unsigned long nr_pages,
 			    const union asce asce, enum gacc_mode mode)
 {
-	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
-	struct trans_exc_code_bits *tec_bits;
-	int lap_enabled, rc;
+	int lap_enabled, rc = 0;
 
-	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
 	lap_enabled = low_address_protection_enabled(vcpu, asce);
 	while (nr_pages) {
 		ga = kvm_s390_logical_to_effective(vcpu, ga);
-		tec_bits->addr = ga >> PAGE_SHIFT;
-		if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
-			pgm->code = PGM_PROTECTION;
-			return pgm->code;
-		}
+		if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+			return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+					 PROT_TYPE_LA);
 		ga &= PAGE_MASK;
 		if (psw_bits(*psw).t) {
 			rc = guest_translate(vcpu, ga, pages, asce, mode);
 			if (rc < 0)
 				return rc;
-			if (rc == PGM_PROTECTION)
-				tec_bits->b61 = 1;
-			if (rc)
-				pgm->code = rc;
 		} else {
 			*pages = kvm_s390_real_to_abs(vcpu, ga);
 			if (kvm_is_error_gpa(vcpu->kvm, *pages))
-				pgm->code = PGM_ADDRESSING;
+				rc = PGM_ADDRESSING;
 		}
-		if (pgm->code)
-			return pgm->code;
+		if (rc)
+			return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
 		ga += PAGE_SIZE;
 		pages++;
 		nr_pages--;
@@ -859,7 +850,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 	need_ipte_lock = psw_bits(*psw).t && !asce.r;
 	if (need_ipte_lock)
 		ipte_lock(vcpu);
-	rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+	rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
 	for (idx = 0; idx < nr_pages && !rc; idx++) {
 		gpa = *(pages + idx) + (ga & ~PAGE_MASK);
 		_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);

From bcfa01d787278476f3e79530d03df9b3f52e6e59 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 31 May 2016 20:21:03 +0200
Subject: [PATCH 020/302] KVM: s390: gaccess: convert get_vcpu_asce()

Let's use our new function for preparing translation exceptions.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index ec6c91e85dbe92..8e245e764c210a 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -543,13 +543,6 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
 {
 	int rc;
 	struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
-	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
-	struct trans_exc_code_bits *tec_bits;
-
-	memset(pgm, 0, sizeof(*pgm));
-	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-	tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
-	tec_bits->as = psw.as;
 
 	if (!psw.t) {
 		asce->val = 0;
@@ -572,22 +565,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
 		return 0;
 	case PSW_AS_ACCREG:
 		rc = ar_translation(vcpu, asce, ar, mode);
-		switch (rc) {
-		case PGM_ALEN_TRANSLATION:
-		case PGM_ALE_SEQUENCE:
-		case PGM_ASTE_VALIDITY:
-		case PGM_ASTE_SEQUENCE:
-		case PGM_EXTENDED_AUTHORITY:
-			vcpu->arch.pgm.exc_access_id = ar;
-			break;
-		case PGM_PROTECTION:
-			tec_bits->addr = ga >> PAGE_SHIFT;
-			tec_bits->b60 = 1;
-			tec_bits->b61 = 1;
-			break;
-		}
 		if (rc > 0)
-			pgm->code = rc;
+			return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
 		return rc;
 	}
 	return 0;

From 1afd43e0fbba4a92effc22977e3a7e64213ee860 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 18 May 2016 15:59:06 +0200
Subject: [PATCH 021/302] s390/crypto: allow to query all known cpacf functions

KVM will have to query these functions, let's add at least the query
capabilities.

PCKMO has RRE format, as bit 16-31 are ignored, we can still use the
existing function. As PCKMO won't touch the cc, let's force it to 0
upfront.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Ingo Tuchscherer <ingo.tuchscherer@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/cpacf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 1a82cf26ee11ba..d28621de8e0b14 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -20,6 +20,9 @@
 #define CPACF_KMC		0xb92f		/* MSA	*/
 #define CPACF_KIMD		0xb93e		/* MSA	*/
 #define CPACF_KLMD		0xb93f		/* MSA	*/
+#define CPACF_PCKMO		0xb928		/* MSA3 */
+#define CPACF_KMF		0xb92a		/* MSA4 */
+#define CPACF_KMO		0xb92b		/* MSA4 */
 #define CPACF_PCC		0xb92c		/* MSA4 */
 #define CPACF_KMCTR		0xb92d		/* MSA4 */
 #define CPACF_PPNO		0xb93c		/* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
 	register unsigned long r1 asm("1") = (unsigned long) status;
 
 	asm volatile(
+		"	spm 0\n" /* pckmo doesn't change the cc */
 		/* Parameter registers are ignored, but may not be 0 */
 		"0:	.insn	rrf,%[opc] << 16,2,2,2,0\n"
 		"	brc	1,0b\n"	/* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
 		if (!test_facility(17))	/* check for MSA */
 			return 0;
 		break;
+	case CPACF_PCKMO:
+		if (!test_facility(76))	/* check for MSA3 */
+			return 0;
+		break;
+	case CPACF_KMF:
+	case CPACF_KMO:
 	case CPACF_PCC:
 	case CPACF_KMCTR:
 		if (!test_facility(77))	/* check for MSA4 */

From 0a763c780b7cb830c250d00ead975778ab948f40 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 18 May 2016 16:03:47 +0200
Subject: [PATCH 022/302] KVM: s390: interface to query and configure cpu
 subfunctions

We have certain instructions that indicate available subfunctions via
a query subfunction (crypto functions and ptff), or via a test bit
function (plo).

By exposing these "subfunction blocks" to user space, we allow user space
to
1) query available subfunctions and make sure subfunctions won't get lost
   during migration - e.g. properly indicate them via a CPU model
2) change the subfunctions to be reported to the guest (even adding
   unavailable ones)

This mechanism works just like the way we indicate the stfl(e) list to
user space.

This way, user space could even emulate some subfunctions in QEMU in the
future. If this is ever applicable, we have to make sure later on, that
unsupported subfunctions result in an intercept to QEMU.

Please note that support to indicate them to the guest is still missing
and requires hardware support. Usually, the IBC already takes care of these
subfunctions for migration safety. QEMU should make sure to always set
these bits properly according to the machine generation to be emulated.

Available subfunctions are only valid in combination with STFLE bits
retrieved via KVM_S390_VM_CPU_MACHINE and enabled via
KVM_S390_VM_CPU_PROCESSOR. If the applicable bits are available, the
indicated subfunctions are guaranteed to be correct.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/vm.txt | 57 +++++++++++++++
 arch/s390/include/uapi/asm/kvm.h         | 20 ++++++
 arch/s390/kvm/kvm-s390.c                 | 89 ++++++++++++++++++++++++
 3 files changed, 166 insertions(+)

diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index 0ed6808b9965d1..8a458f42ded286 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -112,6 +112,63 @@ Returns:    -EFAULT if the given address is not accessible from kernel space.
 	    -EBUSY if at least one VCPU has already been defined.
 	    0 in case of success.
 
+2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o)
+
+Allows user space to retrieve available cpu subfunctions without any filtering
+done by a set IBC. These subfunctions are indicated to the guest VCPU via
+query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+
+A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the
+STFL(E) bit introducing the affected instruction. If the affected instruction
+indicates subfunctions via a "query subfunction", the response block is
+contained in the returned struct. If the affected instruction
+indicates subfunctions via a "test bit" mechanism, the subfunction codes are
+contained in the returned struct in MSB 0 bit numbering.
+
+struct kvm_s390_vm_cpu_subfunc {
+       u8 plo[32];           # always valid (ESA/390 feature)
+       u8 ptff[16];          # valid with TOD-clock steering
+       u8 kmac[16];          # valid with Message-Security-Assist
+       u8 kmc[16];           # valid with Message-Security-Assist
+       u8 km[16];            # valid with Message-Security-Assist
+       u8 kimd[16];          # valid with Message-Security-Assist
+       u8 klmd[16];          # valid with Message-Security-Assist
+       u8 pckmo[16];         # valid with Message-Security-Assist-Extension 3
+       u8 kmctr[16];         # valid with Message-Security-Assist-Extension 4
+       u8 kmf[16];           # valid with Message-Security-Assist-Extension 4
+       u8 kmo[16];           # valid with Message-Security-Assist-Extension 4
+       u8 pcc[16];           # valid with Message-Security-Assist-Extension 4
+       u8 ppno[16];          # valid with Message-Security-Assist-Extension 5
+       u8 reserved[1824];    # reserved for future instructions
+};
+
+Parameters: address of a buffer to store the subfunction blocks to.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+	    0 in case of success.
+
+2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w)
+
+Allows user space to retrieve or change cpu subfunctions to be indicated for
+all VCPUs of a VM. This attribute will only be available if kernel and
+hardware support are in place.
+
+The kernel uses the configured subfunction blocks for indication to
+the guest. A subfunction block will only be used if the associated STFL(E) bit
+has not been disabled by user space (so the instruction to be queried is
+actually available for the guest).
+
+As long as no data has been written, a read will fail. In this case, the IBC
+will be used to determine the available subfunctions, which guarantees backward
+compatibility.
+
+See 2.5. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the subfunction blocks from.
+Returns:    -EFAULT if the given address is not accessible from kernel space.
+	    -EINVAL when reading, if there was no write yet.
+	    -EBUSY if at least one VCPU has already been defined.
+	    0 in case of success.
+
 3. GROUP: KVM_S390_VM_TOD
 Architectures: s390
 
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 789c4e27e2941a..f0818d70d73dc8 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -102,6 +102,26 @@ struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
 
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC	4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC		5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+	__u8 plo[32];		/* always */
+	__u8 ptff[16];		/* with TOD-clock steering */
+	__u8 kmac[16];		/* with MSA */
+	__u8 kmc[16];		/* with MSA */
+	__u8 km[16];		/* with MSA */
+	__u8 kimd[16];		/* with MSA */
+	__u8 klmd[16];		/* with MSA */
+	__u8 pckmo[16];		/* with MSA3 */
+	__u8 kmctr[16];		/* with MSA4 */
+	__u8 kmf[16];		/* with MSA4 */
+	__u8 kmo[16];		/* with MSA4 */
+	__u8 pcc[16];		/* with MSA4 */
+	__u8 ppno[16];		/* with MSA5 */
+	__u8 reserved[1824];
+};
+
 /* kvm attributes for crypto */
 #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW	0
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW	1
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2b5c14da32273a..f746a35e39500d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -36,6 +36,8 @@
 #include <asm/switch_to.h>
 #include <asm/isc.h>
 #include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/etr.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -135,6 +137,8 @@ unsigned long kvm_s390_fac_list_mask_size(void)
 
 /* available cpu features supported by kvm */
 static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
 
 static struct gmap_notifier gmap_notifier;
 debug_info_t *kvm_s390_dbf;
@@ -198,8 +202,52 @@ static void allow_cpu_feat(unsigned long nr)
 	set_bit_inv(nr, kvm_s390_available_cpu_feat);
 }
 
+static inline int plo_test_bit(unsigned char nr)
+{
+	register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+	int cc = 3; /* subfunction not available */
+
+	asm volatile(
+		/* Parameter registers are ignored for "test bit" */
+		"	plo	0,0,0,0(0)\n"
+		"	ipm	%0\n"
+		"	srl	%0,28\n"
+		: "=d" (cc)
+		: "d" (r0)
+		: "cc");
+	return cc == 0;
+}
+
 static void kvm_s390_cpu_feat_init(void)
 {
+	int i;
+
+	for (i = 0; i < 256; ++i) {
+		if (plo_test_bit(i))
+			kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+	}
+
+	if (test_facility(28)) /* TOD-clock steering */
+		etr_ptff(kvm_s390_available_subfunc.ptff, ETR_PTFF_QAF);
+
+	if (test_facility(17)) { /* MSA */
+		__cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+		__cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+		__cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+		__cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+		__cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+	}
+	if (test_facility(76)) /* MSA3 */
+		__cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+	if (test_facility(77)) { /* MSA4 */
+		__cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+		__cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+		__cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+		__cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+	}
+	if (test_facility(57)) /* MSA5 */
+		__cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
 	if (MACHINE_HAS_ESOP)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
 }
@@ -717,6 +765,16 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
 	return ret;
 }
 
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+					  struct kvm_device_attr *attr)
+{
+	/*
+	 * Once supported by kernel + hw, we have to store the subfunctions
+	 * in kvm->arch and remember that user space configured them.
+	 */
+	return -ENXIO;
+}
+
 static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret = -ENXIO;
@@ -728,6 +786,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CPU_PROCESSOR_FEAT:
 		ret = kvm_s390_set_processor_feat(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+		ret = kvm_s390_set_processor_subfunc(kvm, attr);
+		break;
 	}
 	return ret;
 }
@@ -801,6 +862,25 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
 	return 0;
 }
 
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+					  struct kvm_device_attr *attr)
+{
+	/*
+	 * Once we can actually configure subfunctions (kernel + hw support),
+	 * we have to check if they were already set by user space, if so copy
+	 * them from kvm->arch.
+	 */
+	return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+					struct kvm_device_attr *attr)
+{
+	if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+	    sizeof(struct kvm_s390_vm_cpu_subfunc)))
+		return -EFAULT;
+	return 0;
+}
 static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret = -ENXIO;
@@ -818,6 +898,12 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_CPU_MACHINE_FEAT:
 		ret = kvm_s390_get_machine_feat(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+		ret = kvm_s390_get_processor_subfunc(kvm, attr);
+		break;
+	case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+		ret = kvm_s390_get_machine_subfunc(kvm, attr);
+		break;
 	}
 	return ret;
 }
@@ -903,8 +989,11 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 		case KVM_S390_VM_CPU_MACHINE:
 		case KVM_S390_VM_CPU_PROCESSOR_FEAT:
 		case KVM_S390_VM_CPU_MACHINE_FEAT:
+		case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
 			ret = 0;
 			break;
+		/* configuring subfunctions is not supported yet */
+		case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
 		default:
 			ret = -ENXIO;
 			break;

From 4013ade3fb2fefa021827d675d8bc1d51f4aef93 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:49:43 +0100
Subject: [PATCH 023/302] s390/sclp: detect 64-bit-SCAO facility

Let's correctly detect that facility, so we can correctly handle its
absence later on.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 49736a0d4e0e9a..521400086e6559 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -59,6 +59,7 @@ struct sclp_info {
 	unsigned char has_hvs : 1;
 	unsigned char has_esca : 1;
 	unsigned char has_sief2 : 1;
+	unsigned char has_64bscao : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 0ac520dd1b219f..211eb86ae62d6b 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -114,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 	sclp.facilities = sccb->facilities;
 	sclp.has_sprp = !!(sccb->fac84 & 0x02);
 	sclp.has_core_type = !!(sccb->fac84 & 0x01);
+	sclp.has_64bscao = !!(sccb->fac116 & 0x80);
 	sclp.has_esca = !!(sccb->fac116 & 0x08);
 	sclp.has_hvs = !!(sccb->fac119 & 0x80);
 	if (sccb->fac85 & 0x02)

From 76a6dd7241ae03c47f44a9605dcd525f31b2124a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:33:49 +0100
Subject: [PATCH 024/302] KVM: s390: handle missing 64-bit-SCAO facility

Without that facility, we may only use scaol. So fall back
to DMA allocation in that case, so we won't overwrite random memory
via the SIE.

Also disallow ESCA, so we don't have to handle that allocation case.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f746a35e39500d..efb902cdd1d24b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -317,8 +317,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 	case KVM_CAP_NR_VCPUS:
 	case KVM_CAP_MAX_VCPUS:
-		r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
-				  : KVM_S390_BSCA_CPU_SLOTS;
+		r = KVM_S390_BSCA_CPU_SLOTS;
+		if (sclp.has_esca && sclp.has_64bscao)
+			r = KVM_S390_ESCA_CPU_SLOTS;
 		break;
 	case KVM_CAP_NR_MEMSLOTS:
 		r = KVM_USER_MEM_SLOTS;
@@ -1295,6 +1296,7 @@ static void sca_dispose(struct kvm *kvm)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	gfp_t alloc_flags = GFP_KERNEL;
 	int i, rc;
 	char debug_name[16];
 	static unsigned long sca_offset;
@@ -1319,8 +1321,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
 
 	kvm->arch.use_esca = 0; /* start with basic SCA */
+	if (!sclp.has_64bscao)
+		alloc_flags |= GFP_DMA;
 	rwlock_init(&kvm->arch.sca_lock);
-	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
 	if (!kvm->arch.sca)
 		goto out_err;
 	spin_lock(&kvm_lock);
@@ -1567,7 +1571,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
 
 	if (id < KVM_S390_BSCA_CPU_SLOTS)
 		return true;
-	if (!sclp.has_esca)
+	if (!sclp.has_esca || !sclp.has_64bscao)
 		return false;
 
 	mutex_lock(&kvm->lock);

From b9e28897e6e9f82585ecf6ea45942866ece7d167 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:51:52 +0100
Subject: [PATCH 025/302] s390/sclp: detect guest-PER enhancement

Let's detect that facility, so we can correctly handle its absence.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 4 +++-
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 521400086e6559..076f6318b6fa5d 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -33,7 +33,8 @@ struct sclp_core_entry {
 	u8 : 4;
 	u8 sief2 : 1;
 	u8 : 3;
-	u8 : 3;
+	u8 : 2;
+	u8 gpere : 1;
 	u8 siif : 1;
 	u8 sigpif : 1;
 	u8 : 3;
@@ -60,6 +61,7 @@ struct sclp_info {
 	unsigned char has_esca : 1;
 	unsigned char has_sief2 : 1;
 	unsigned char has_64bscao : 1;
+	unsigned char has_gpere : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 211eb86ae62d6b..a05f2d07ea02b9 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -146,6 +146,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 		sclp.has_siif = cpue->siif;
 		sclp.has_sigpif = cpue->sigpif;
 		sclp.has_sief2 = cpue->sief2;
+		sclp.has_gpere = cpue->gpere;
 		break;
 	}
 

From 89b5b4de33902a57cb9c8f2d06de4ffbc921de15 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:47:13 +0100
Subject: [PATCH 026/302] KVM: s390: guestdbg: signal missing hardware support

Without guest-PER enhancement, we can't provide any debugging support.
Therefore act like kernel support is missing.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index efb902cdd1d24b..e477c8e5b5c176 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2179,6 +2179,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 	if (dbg->control & ~VALID_GUESTDBG_FLAGS)
 		return -EINVAL;
+	if (!sclp.has_gpere)
+		return -EINVAL;
 
 	if (dbg->control & KVM_GUESTDBG_ENABLE) {
 		vcpu->guest_debug = dbg->control;

From 09be9cb92bb9e799bdbfd3834595bd6b4703b40b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:55:35 +0100
Subject: [PATCH 027/302] s390/sclp: detect cmma

Let's detect the Collaborative-memory-management-interpretation facility,
aka CMM assist, so we can correctly enable cmma later.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 076f6318b6fa5d..fa40ac8056f55f 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -62,6 +62,7 @@ struct sclp_info {
 	unsigned char has_sief2 : 1;
 	unsigned char has_64bscao : 1;
 	unsigned char has_gpere : 1;
+	unsigned char has_cmma : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index a05f2d07ea02b9..366e1a46e96d7a 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -115,6 +115,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 	sclp.has_sprp = !!(sccb->fac84 & 0x02);
 	sclp.has_core_type = !!(sccb->fac84 & 0x01);
 	sclp.has_64bscao = !!(sccb->fac116 & 0x80);
+	sclp.has_cmma = !!(sccb->fac116 & 0x40);
 	sclp.has_esca = !!(sccb->fac116 & 0x08);
 	sclp.has_hvs = !!(sccb->fac119 & 0x80);
 	if (sccb->fac85 & 0x02)

From c24cc9c8a6ca798427d3ff46b55df8403361df3e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:53:04 +0100
Subject: [PATCH 028/302] KVM: s390: enable CMMA if the interpretation is
 available

Now that we can detect if collaborative-memory-management interpretation
is available, replace the heuristic by a real hardware detection.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e477c8e5b5c176..005e664f636057 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -485,9 +485,8 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 	unsigned int idx;
 	switch (attr->attr) {
 	case KVM_S390_VM_MEM_ENABLE_CMMA:
-		/* enable CMMA only for z10 and later (EDAT_1) */
 		ret = -EINVAL;
-		if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+		if (!sclp.has_cmma)
 			break;
 
 		ret = -EBUSY;

From f9cbd9b02539330ddd349df583fcfc2db8a23b90 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 3 Mar 2016 09:48:47 +0100
Subject: [PATCH 029/302] KVM: s390: provide CMMA attributes only if available

Let's not provide the device attribute for cmma enabling and clearing
if the hardware doesn't support it.

This also helps getting rid of the undocumented return value "-EINVAL"
in case CMMA is not available when trying to enable it.

Also properly document the meaning of -EINVAL for CMMA clearing.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt        | 2 ++
 Documentation/virtual/kvm/devices/vm.txt | 3 ++-
 arch/s390/kvm/kvm-s390.c                 | 7 ++++++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a4482cce4bae04..4aac3e51bf9f67 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2520,6 +2520,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
   EPERM:  The attribute cannot (currently) be accessed this way
           (e.g. read-only attribute, or attribute that only makes
           sense when the device is in a different state)
@@ -2547,6 +2548,7 @@ Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
   ENXIO:  The group or attribute is unknown/unsupported for this device
+          or hardware support is missing.
 
 Tests whether a device supports a particular attribute.  A successful
 return indicates the attribute is implemented.  It does not necessarily
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index 8a458f42ded286..b6cda49f2ba418 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine.
 
 1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA
 Parameters: none
-Returns: 0
+Returns: -EINVAL if CMMA was not enabled
+         0 otherwise
 
 Clear the CMMA status for all guest pages, so any pages the guest marked
 as unused are again used any may not be reclaimed by the host.
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 005e664f636057..f695c6e08337b8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -485,7 +485,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 	unsigned int idx;
 	switch (attr->attr) {
 	case KVM_S390_VM_MEM_ENABLE_CMMA:
-		ret = -EINVAL;
+		ret = -ENXIO;
 		if (!sclp.has_cmma)
 			break;
 
@@ -499,6 +499,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		mutex_unlock(&kvm->lock);
 		break;
 	case KVM_S390_VM_MEM_CLR_CMMA:
+		ret = -ENXIO;
+		if (!sclp.has_cmma)
+			break;
 		ret = -EINVAL;
 		if (!kvm->arch.use_cmma)
 			break;
@@ -964,6 +967,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 		switch (attr->attr) {
 		case KVM_S390_VM_MEM_ENABLE_CMMA:
 		case KVM_S390_VM_MEM_CLR_CMMA:
+			ret = sclp.has_cmma ? 0 : -ENXIO;
+			break;
 		case KVM_S390_VM_MEM_LIMIT_SIZE:
 			ret = 0;
 			break;

From 5236c751da5e6ccfda4e5d53690a37dfb456997b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:53:46 +0100
Subject: [PATCH 030/302] s390/sclp: detect guest-storage-limit-suppression

Let's detect that facility.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index fa40ac8056f55f..e1450dd9d93297 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -63,6 +63,7 @@ struct sclp_info {
 	unsigned char has_64bscao : 1;
 	unsigned char has_gpere : 1;
 	unsigned char has_cmma : 1;
+	unsigned char has_gsls : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 366e1a46e96d7a..99fce6b784bf71 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -114,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 	sclp.facilities = sccb->facilities;
 	sclp.has_sprp = !!(sccb->fac84 & 0x02);
 	sclp.has_core_type = !!(sccb->fac84 & 0x01);
+	sclp.has_gsls = !!(sccb->fac85 & 0x80);
 	sclp.has_64bscao = !!(sccb->fac116 & 0x80);
 	sclp.has_cmma = !!(sccb->fac116 & 0x40);
 	sclp.has_esca = !!(sccb->fac116 & 0x08);

From efed110446226c725268a1f980806d799990a979 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2015 12:32:41 +0200
Subject: [PATCH 031/302] KVM: s390: handle missing
 guest-storage-limit-suppression

If guest-storage-limit-suppression is not available, we would for now
have a valid guest address space with size 0. So let's simply set the
origin to 0 and the limit to hamax.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h | 4 +++-
 arch/s390/kvm/kvm-s390.c         | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b2a83a0ce42cc6..9eed5c18a61c39 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -186,7 +186,9 @@ struct kvm_s390_sie_block {
 	__u32	scaol;			/* 0x0064 */
 	__u8	reserved68[4];		/* 0x0068 */
 	__u32	todpr;			/* 0x006c */
-	__u8	reserved70[32];		/* 0x0070 */
+	__u8	reserved70[16];		/* 0x0070 */
+	__u64	mso;			/* 0x0080 */
+	__u64	msl;			/* 0x0088 */
 	psw_t	gpsw;			/* 0x0090 */
 	__u64	gg14;			/* 0x00a0 */
 	__u64	gg15;			/* 0x00a8 */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f695c6e08337b8..2a239554eb890f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1897,6 +1897,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->arch.sie_block = &sie_page->sie_block;
 	vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
 
+	/* the real guest size will always be smaller than msl */
+	vcpu->arch.sie_block->mso = 0;
+	vcpu->arch.sie_block->msl = sclp.hamax;
+
 	vcpu->arch.sie_block->icpua = id;
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;

From 72cd82b9e9d075713367ad840c2a9b52b4cd447d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:59:03 +0100
Subject: [PATCH 032/302] s390/sclp: detect intervention bypass facility

Let's detect if we have the intervention bypass facility installed.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 7 ++++++-
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index e1450dd9d93297..ef1f427ad4d1a7 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -38,7 +38,11 @@ struct sclp_core_entry {
 	u8 siif : 1;
 	u8 sigpif : 1;
 	u8 : 3;
-	u8 reserved2[10];
+	u8 reserved2[3];
+	u8 : 2;
+	u8 ib : 1;
+	u8 : 5;
+	u8 reserved3[6];
 	u8 type;
 	u8 reserved1;
 } __attribute__((packed));
@@ -64,6 +68,7 @@ struct sclp_info {
 	unsigned char has_gpere : 1;
 	unsigned char has_cmma : 1;
 	unsigned char has_gsls : 1;
+	unsigned char has_ib : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 99fce6b784bf71..2240b615131e8a 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -149,6 +149,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 		sclp.has_sigpif = cpue->sigpif;
 		sclp.has_sief2 = cpue->sief2;
 		sclp.has_gpere = cpue->gpere;
+		sclp.has_ib = cpue->ib;
 		break;
 	}
 

From 11ad65b79e8c27cdafe404e33938da270a55858a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 4 Apr 2016 15:46:26 +0200
Subject: [PATCH 033/302] KVM: s390: enable ib only if available

Let's enable intervention bypass only if the facility is actually
available.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2a239554eb890f..340fb405bc2386 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1845,7 +1845,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	if (test_kvm_facility(vcpu->kvm, 8))
 		vcpu->arch.sie_block->ecb2 |= 0x08;
-	vcpu->arch.sie_block->eca   = 0xC1002000U;
+	vcpu->arch.sie_block->eca = 0x81002000U;
+	if (sclp.has_ib)
+		vcpu->arch.sie_block->eca |= 0x40000000U;
 	if (sclp.has_siif)
 		vcpu->arch.sie_block->eca |= 1;
 	if (sclp.has_sigpif)

From 4a5c3e08271216891ce1b5315cec3dcadbd01cd4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:00:23 +0100
Subject: [PATCH 034/302] s390/sclp: detect conditional-external-interception
 facility

Let's detect if we have that facility.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 4 +++-
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index ef1f427ad4d1a7..c91ad198a59c8a 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -41,7 +41,8 @@ struct sclp_core_entry {
 	u8 reserved2[3];
 	u8 : 2;
 	u8 ib : 1;
-	u8 : 5;
+	u8 cei : 1;
+	u8 : 4;
 	u8 reserved3[6];
 	u8 type;
 	u8 reserved1;
@@ -69,6 +70,7 @@ struct sclp_info {
 	unsigned char has_cmma : 1;
 	unsigned char has_gsls : 1;
 	unsigned char has_ib : 1;
+	unsigned char has_cei : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 2240b615131e8a..4b330fbd4f087f 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -150,6 +150,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 		sclp.has_sief2 = cpue->sief2;
 		sclp.has_gpere = cpue->gpere;
 		sclp.has_ib = cpue->ib;
+		sclp.has_cei = cpue->cei;
 		break;
 	}
 

From 48ee7d3a7f8f3ca90dfc5e1103e68c0044051acc Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 4 Apr 2016 15:49:34 +0200
Subject: [PATCH 035/302] KVM: s390: enable cei only if available

Let's only enable conditional-external-interception if the facility is
actually available.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 340fb405bc2386..1a239a6748fe2f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1845,7 +1845,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	if (test_kvm_facility(vcpu->kvm, 8))
 		vcpu->arch.sie_block->ecb2 |= 0x08;
-	vcpu->arch.sie_block->eca = 0x81002000U;
+	vcpu->arch.sie_block->eca = 0x1002000U;
+	if (sclp.has_cei)
+		vcpu->arch.sie_block->eca |= 0x80000000U;
 	if (sclp.has_ib)
 		vcpu->arch.sie_block->eca |= 0x40000000U;
 	if (sclp.has_siif)

From a0eb55e6318f1bcfe93b01f0944622f14a6b2977 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:02:25 +0100
Subject: [PATCH 036/302] s390/sclp: detect PFMF interpretation facility

Let's detect that facility.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index c91ad198a59c8a..bdb7f22d9ad448 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -71,6 +71,7 @@ struct sclp_info {
 	unsigned char has_gsls : 1;
 	unsigned char has_ib : 1;
 	unsigned char has_cei : 1;
+	unsigned char has_pfmfi : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 4b330fbd4f087f..500cbfd835411c 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -46,7 +46,8 @@ struct read_info_sccb {
 	u64	rnmax2;			/* 104-111 */
 	u8	_pad_112[116 - 112];	/* 112-115 */
 	u8	fac116;			/* 116 */
-	u8	_pad_117[119 - 117];	/* 117-118 */
+	u8	fac117;			/* 117 */
+	u8	_pad_118;		/* 118 */
 	u8	fac119;			/* 119 */
 	u16	hcpua;			/* 120-121 */
 	u8	_pad_122[124 - 122];	/* 122-123 */
@@ -118,6 +119,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 	sclp.has_64bscao = !!(sccb->fac116 & 0x80);
 	sclp.has_cmma = !!(sccb->fac116 & 0x40);
 	sclp.has_esca = !!(sccb->fac116 & 0x08);
+	sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
 	sclp.has_hvs = !!(sccb->fac119 & 0x80);
 	if (sccb->fac85 & 0x02)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;

From 873b425e4c2fd0ba6617d67a45fbf119b65575b4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 4 Apr 2016 15:53:47 +0200
Subject: [PATCH 037/302] KVM: s390: enable PFMFI only if available

Let's enable interpretation of PFMFI only if the facility is
actually available. Emulation code still works in case the guest is
offered EDAT-1.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1a239a6748fe2f..d987eb8af05916 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1843,7 +1843,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
 		vcpu->arch.sie_block->ecb |= 0x10;
 
-	if (test_kvm_facility(vcpu->kvm, 8))
+	if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
 		vcpu->arch.sie_block->ecb2 |= 0x08;
 	vcpu->arch.sie_block->eca = 0x1002000U;
 	if (sclp.has_cei)

From 9c375490fc812ebdf3259ea2566c271d00544fc2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 13:02:52 +0100
Subject: [PATCH 038/302] s390/sclp: detect interlock-and-broadcast-suppression
 facility

Let's detect that facility.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index bdb7f22d9ad448..99a0150d07b975 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -72,6 +72,7 @@ struct sclp_info {
 	unsigned char has_ib : 1;
 	unsigned char has_cei : 1;
 	unsigned char has_pfmfi : 1;
+	unsigned char has_ibs : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 500cbfd835411c..d5b873c92ffc24 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -120,6 +120,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 	sclp.has_cmma = !!(sccb->fac116 & 0x40);
 	sclp.has_esca = !!(sccb->fac116 & 0x08);
 	sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
+	sclp.has_ibs = !!(sccb->fac117 & 0x20);
 	sclp.has_hvs = !!(sccb->fac119 & 0x80);
 	if (sccb->fac85 & 0x02)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;

From 09a400e78eaf02d8ab8e836edf864e1025c8e2d7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 4 Apr 2016 15:57:08 +0200
Subject: [PATCH 039/302] KVM: s390: enable ibs only if available

Let's enable interlock-and-broadcast suppression only if the facility is
actually available.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d987eb8af05916..ad93b40bfdc093 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2789,6 +2789,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
 
 static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
+	if (!sclp.has_ibs)
+		return;
 	kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
 	kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
 }

From bdab09f3d81c3fac6314012ca0eff1206ea067ab Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 12 Apr 2016 11:07:49 +0200
Subject: [PATCH 040/302] KVM: s390: enable host-protection-interruption only
 with ESOP

host-protection-interruption control was introduced with ESOP. So let's
enable it only if we have ESOP and add an explanatory comment why
we can live without it.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ad93b40bfdc093..4e764faed52477 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1837,7 +1837,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	kvm_s390_vcpu_setup_model(vcpu);
 
-	vcpu->arch.sie_block->ecb = 0x02;
+	/* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+	if (MACHINE_HAS_ESOP)
+		vcpu->arch.sie_block->ecb |= 0x02;
 	if (test_kvm_facility(vcpu->kvm, 9))
 		vcpu->arch.sie_block->ecb |= 0x04;
 	if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))

From f597d24eee2dd9486edaac7a1821f35bc4d349c2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 22 Apr 2016 16:26:49 +0200
Subject: [PATCH 041/302] KVM: s390: turn on tx even without ctx

Constrained transactional execution is an addon of transactional execution.

Let's enable the assist also if only TX is enabled for the guest.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4e764faed52477..9d0e4d0487f431 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1842,7 +1842,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 		vcpu->arch.sie_block->ecb |= 0x02;
 	if (test_kvm_facility(vcpu->kvm, 9))
 		vcpu->arch.sie_block->ecb |= 0x04;
-	if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+	if (test_kvm_facility(vcpu->kvm, 73))
 		vcpu->arch.sie_block->ecb |= 0x10;
 
 	if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)

From 1bb78d161feae5b613c80eb822059eec60d2a538 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 7 Jun 2016 09:57:08 +0200
Subject: [PATCH 042/302] KVM: s390: provide logging for diagnose 0x500

We might need to debug some virtio things, so better have diagnose 500
logged.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 arch/s390/kvm/diag.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1ea4095b67d729..ce865bd4f81d96 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
 	    (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
 		return -EOPNOTSUPP;
 
+	VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+			    (u32) vcpu->run->s.regs.gprs[2],
+			    (u32) vcpu->run->s.regs.gprs[3],
+			    vcpu->run->s.regs.gprs[4]);
+
 	/*
 	 * The layout is as follows:
 	 * - gpr 2 contains the subchannel id (passed as addr)

From dcc98ea6146e4da27eee2f3e9983500e9618cc23 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 7 Jun 2016 09:37:17 +0200
Subject: [PATCH 043/302] KVM: s390: fixup I/O interrupt traces

We currently have two issues with the I/O interrupt injection logging:
1. All QEMU versions up to 2.6 have a wrong encoding of device numbers
etc for the I/O interrupt type, so the inject VM_EVENT will have wrong
data. Let's fix this by using the interrupt parameters and not the
interrupt type number.
2. We only log in kvm_s390_inject_vm, but not when coming from
kvm_s390_reinject_io_int or from flic. Let's move the logging to the
common __inject_io function.

We also enhance the logging for delivery to match the data.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5a80af740d3eef..d72c4a877622af 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,6 @@
 #include "gaccess.h"
 #include "trace-s390.h"
 
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
 #define PFAULT_INIT 0x0600
 #define PFAULT_DONE 0x0680
 #define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
 					struct kvm_s390_interrupt_info,
 					list);
 	if (inti) {
-		VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+		if (inti->type & KVM_S390_INT_IO_AI_MASK)
+			VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+		else
+			VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+			inti->io.subchannel_id >> 8,
+			inti->io.subchannel_id >> 1 & 0x3,
+			inti->io.subchannel_nr);
+
 		vcpu->stat.deliver_io_int++;
 		trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
 				inti->type,
@@ -1415,6 +1419,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 	}
 	fi->counters[FIRQ_CNTR_IO] += 1;
 
+	if (inti->type & KVM_S390_INT_IO_AI_MASK)
+		VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+	else
+		VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+			inti->io.subchannel_id >> 8,
+			inti->io.subchannel_id >> 1 & 0x3,
+			inti->io.subchannel_nr);
 	isc = int_word_to_isc(inti->io.io_int_word);
 	list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
 	list_add_tail(&inti->list, list);
@@ -1531,13 +1542,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 		inti->mchk.mcic = s390int->parm64;
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		if (inti->type & KVM_S390_INT_IO_AI_MASK)
-			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
-		else
-			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
-				 s390int->type & IOINT_CSSID_MASK,
-				 s390int->type & IOINT_SSID_MASK,
-				 s390int->type & IOINT_SCHID_MASK);
 		inti->io.subchannel_id = s390int->parm >> 16;
 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
 		inti->io.io_int_parm = s390int->parm64 >> 32;

From c427c42cd612719e8fb8b5891cc9761e7770024e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 10 May 2016 13:51:54 +0200
Subject: [PATCH 044/302] s390/mm: don't drop errors in get_guest_storage_key

Commit 1e133ab296f3 ("s390/mm: split arch/s390/mm/pgtable.c") changed
the return value of get_guest_storage_key to an unsigned char, resulting
in -EFAULT getting interpreted as a valid storage key.

Cc: stable@vger.kernel.org # 4.6+
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 2 +-
 arch/s390/mm/pgtable.c          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 18d2beb89340a6..42b968a8586304 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -893,7 +893,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char key, bool nq);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
 
 /*
  * Certain architectures need to do special things when PTEs
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4324b87f93982f..2a23ca96f9c29b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -543,7 +543,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 {
 	unsigned char key;
 	spinlock_t *ptl;

From d3ed1ceeace311af9973d17a07a114bfaf0ca1b1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:53:35 +0100
Subject: [PATCH 045/302] s390/mm: set and get guest storage key mmap locking

Move the mmap semaphore locking out of set_guest_storage_key
and get_guest_storage_key. This makes the two functions more
like the other ptep_xxx operations and allows callers to avoid repeated
semaphore operations if multiple keys are read or written.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 26 ++++++++++++++++----------
 arch/s390/kvm/priv.c     |  7 +++++--
 arch/s390/mm/pgtable.c   | 15 +++------------
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9d0e4d0487f431..d0156d7969e039 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1050,26 +1050,30 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 	if (!keys)
 		return -ENOMEM;
 
+	down_read(&current->mm->mmap_sem);
 	for (i = 0; i < args->count; i++) {
 		hva = gfn_to_hva(kvm, args->start_gfn + i);
 		if (kvm_is_error_hva(hva)) {
 			r = -EFAULT;
-			goto out;
+			break;
 		}
 
 		curkey = get_guest_storage_key(current->mm, hva);
 		if (IS_ERR_VALUE(curkey)) {
 			r = curkey;
-			goto out;
+			break;
 		}
 		keys[i] = curkey;
 	}
+	up_read(&current->mm->mmap_sem);
+
+	if (!r) {
+		r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+				 sizeof(uint8_t) * args->count);
+		if (r)
+			r = -EFAULT;
+	}
 
-	r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
-			 sizeof(uint8_t) * args->count);
-	if (r)
-		r = -EFAULT;
-out:
 	kvfree(keys);
 	return r;
 }
@@ -1106,24 +1110,26 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 	if (r)
 		goto out;
 
+	down_read(&current->mm->mmap_sem);
 	for (i = 0; i < args->count; i++) {
 		hva = gfn_to_hva(kvm, args->start_gfn + i);
 		if (kvm_is_error_hva(hva)) {
 			r = -EFAULT;
-			goto out;
+			break;
 		}
 
 		/* Lowest order bit is reserved */
 		if (keys[i] & 0x01) {
 			r = -EINVAL;
-			goto out;
+			break;
 		}
 
 		r = set_guest_storage_key(current->mm, hva,
 					  (unsigned long)keys[i], 0);
 		if (r)
-			goto out;
+			break;
 	}
+	up_read(&current->mm->mmap_sem);
 out:
 	kvfree(keys);
 	return r;
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 95916fa7c670be..c6deed782c615c 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -728,9 +728,12 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 
 			if (rc)
 				return rc;
-			if (set_guest_storage_key(current->mm, useraddr,
+			down_read(&current->mm->mmap_sem);
+			rc = set_guest_storage_key(current->mm, useraddr,
 					vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
-					vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+					vcpu->run->s.regs.gprs[reg1] & PFMF_NQ);
+			up_read(&current->mm->mmap_sem);
+			if (rc)
 				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 		}
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2a23ca96f9c29b..7612a7c3a3a8bc 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -506,12 +506,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 	pgste_t old, new;
 	pte_t *ptep;
 
-	down_read(&mm->mmap_sem);
 	ptep = get_locked_pte(mm, addr, &ptl);
-	if (unlikely(!ptep)) {
-		up_read(&mm->mmap_sem);
+	if (unlikely(!ptep))
 		return -EFAULT;
-	}
 
 	new = old = pgste_get_lock(ptep);
 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -538,7 +535,6 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 
 	pgste_set_unlock(ptep, new);
 	pte_unmap_unlock(ptep, ptl);
-	up_read(&mm->mmap_sem);
 	return 0;
 }
 EXPORT_SYMBOL(set_guest_storage_key);
@@ -550,14 +546,11 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 	pgste_t pgste;
 	pte_t *ptep;
 
-	down_read(&mm->mmap_sem);
 	ptep = get_locked_pte(mm, addr, &ptl);
-	if (unlikely(!ptep)) {
-		up_read(&mm->mmap_sem);
+	if (unlikely(!ptep))
 		return -EFAULT;
-	}
-	pgste = pgste_get_lock(ptep);
 
+	pgste = pgste_get_lock(ptep);
 	if (pte_val(*ptep) & _PAGE_INVALID) {
 		key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
 		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
@@ -572,10 +565,8 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 		if (pgste_val(pgste) & PGSTE_GC_BIT)
 			key |= _PAGE_CHANGED;
 	}
-
 	pgste_set_unlock(ptep, pgste);
 	pte_unmap_unlock(ptep, ptl);
-	up_read(&mm->mmap_sem);
 	return key;
 }
 EXPORT_SYMBOL(get_guest_storage_key);

From 8d6037a7b4f21708451d4aec14828f9ebe77b37a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 11:15:32 +0200
Subject: [PATCH 046/302] s390/mm: simplify get_guest_storage_key

We can save a few LOC and make that function easier to understand
by rewriting existing code.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/pgtable.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 7612a7c3a3a8bc..4c8d572d59cc54 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -551,20 +551,11 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 		return -EFAULT;
 
 	pgste = pgste_get_lock(ptep);
-	if (pte_val(*ptep) & _PAGE_INVALID) {
-		key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-	} else {
+	key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+	if (!(pte_val(*ptep) & _PAGE_INVALID))
 		key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
-
-		/* Reflect guest's logical view, not physical */
-		if (pgste_val(pgste) & PGSTE_GR_BIT)
-			key |= _PAGE_REFERENCED;
-		if (pgste_val(pgste) & PGSTE_GC_BIT)
-			key |= _PAGE_CHANGED;
-	}
+	/* Reflect guest's logical view, not physical */
+	key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
 	pgste_set_unlock(ptep, pgste);
 	pte_unmap_unlock(ptep, ptl);
 	return key;

From 154c8c19c35b6da94a623cb793458e203572083d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 11:22:34 +0200
Subject: [PATCH 047/302] s390/mm: return key via pointer in
 get_guest_storage_key

Let's just split returning the key and reporting errors. This makes calling
code easier and avoids bugs like the one that already happened.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h |  3 ++-
 arch/s390/kvm/kvm-s390.c        |  8 ++------
 arch/s390/mm/pgtable.c          | 12 ++++++------
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 42b968a8586304..91f0e7b79821bd 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -893,7 +893,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char key, bool nq);
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			  unsigned char *key);
 
 /*
  * Certain architectures need to do special things when PTEs
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d0156d7969e039..ad166c6698e07b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1029,7 +1029,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
 	uint8_t *keys;
 	uint64_t hva;
-	unsigned long curkey;
 	int i, r = 0;
 
 	if (args->flags != 0)
@@ -1058,12 +1057,9 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 			break;
 		}
 
-		curkey = get_guest_storage_key(current->mm, hva);
-		if (IS_ERR_VALUE(curkey)) {
-			r = curkey;
+		r = get_guest_storage_key(current->mm, hva, &keys[i]);
+		if (r)
 			break;
-		}
-		keys[i] = curkey;
 	}
 	up_read(&current->mm->mmap_sem);
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4c8d572d59cc54..3e35298758d6db 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -539,9 +539,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			  unsigned char *key)
 {
-	unsigned char key;
 	spinlock_t *ptl;
 	pgste_t pgste;
 	pte_t *ptep;
@@ -551,14 +551,14 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 		return -EFAULT;
 
 	pgste = pgste_get_lock(ptep);
-	key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
 	if (!(pte_val(*ptep) & _PAGE_INVALID))
-		key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+		*key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
 	/* Reflect guest's logical view, not physical */
-	key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
+	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
 	pgste_set_unlock(ptep, pgste);
 	pte_unmap_unlock(ptep, ptl);
-	return key;
+	return 0;
 }
 EXPORT_SYMBOL(get_guest_storage_key);
 #endif

From fe69eabf8deb85ae8b2958830ea3b2911e332820 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 9 May 2016 13:08:07 +0200
Subject: [PATCH 048/302] KVM: s390: storage keys fit into a char

No need to convert the storage key into an unsigned long, the target
function expects a char as argument.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ad166c6698e07b..49c60393a15ce1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1120,8 +1120,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 			break;
 		}
 
-		r = set_guest_storage_key(current->mm, hva,
-					  (unsigned long)keys[i], 0);
+		r = set_guest_storage_key(current->mm, hva, keys[i], 0);
 		if (r)
 			break;
 	}

From 6164a2e90a5b6c5c32ccfe7a1baff80d603d702d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 13 Apr 2016 10:09:47 +0200
Subject: [PATCH 049/302] KVM: s390: pfmf: fix end address calculation

The current calculation is wrong if absolute != real address. Let's just
calculate the start address for 4k frames upfront. Otherwise, the
calculated end address will be wrong, resulting in wrong memory
location/storage keys getting touched.

To keep low-address protection working (using the effective address),
we have to move the check.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index c6deed782c615c..bfba98302ca035 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -682,8 +682,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	start = kvm_s390_logical_to_effective(vcpu, start);
 
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+		if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+			return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+	}
+
 	switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
 	case 0x00000000:
+		/* only 4k frames specify a real address */
+		start = kvm_s390_real_to_abs(vcpu, start);
 		end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
 		break;
 	case 0x00001000:
@@ -701,20 +708,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	}
 
-	if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
-		if (kvm_s390_check_low_addr_prot_real(vcpu, start))
-			return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-	}
-
 	while (start < end) {
-		unsigned long useraddr, abs_addr;
+		unsigned long useraddr;
 
 		/* Translate guest address to host address */
-		if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
-			abs_addr = kvm_s390_real_to_abs(vcpu, start);
-		else
-			abs_addr = start;
-		useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+		useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
 		if (kvm_is_error_hva(useraddr))
 			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 

From 9a68f0af8cd907452fa6c33343d38cdacff96294 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 13 Apr 2016 12:09:58 +0200
Subject: [PATCH 050/302] KVM: s390: pfmf: MR and MC are ignored without CSSKE

These two bits are simply ignored when the conditional-SSKE facility is
not installed.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index bfba98302ca035..5c926b74d7ca83 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -675,10 +675,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	    !test_kvm_facility(vcpu->kvm, 14))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	/* No support for conditional-SSKE */
-	if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
-		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	start = kvm_s390_logical_to_effective(vcpu, start);
 

From 2c26d1d23abd9a67d056c95a0823132a71edc477 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 13 Apr 2016 15:47:21 +0200
Subject: [PATCH 051/302] KVM: s390: pfmf: take care of amode when setting reg2

Depending on the addressing mode, we must not overwrite bits 0-31 of the
register. In addition, 24 bit and 31 bit have to set certain bits to 0,
which is guaranteed by converting the end address to an effective
address.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 5c926b74d7ca83..71fa603034d057 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -733,8 +733,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 
 		start += PAGE_SIZE;
 	}
-	if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
-		vcpu->run->s.regs.gprs[reg2] = end;
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+		if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+			vcpu->run->s.regs.gprs[reg2] = end;
+		} else {
+			vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+			end = kvm_s390_logical_to_effective(vcpu, end);
+			vcpu->run->s.regs.gprs[reg2] |= end;
+		}
+	}
 	return 0;
 }
 

From 1824c723ac90f9870ebafae4b3b3e5f4b82ffeef Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 10 May 2016 09:43:11 +0200
Subject: [PATCH 052/302] KVM: s390: pfmf: support conditional-sske facility

We already indicate that facility but don't implement it in our pfmf
interception handler. Let's add a new storage key handling function for
conditionally setting the guest storage key.

As we will reuse this function later on, let's directly implement returning
the old key via parameter and indicating if any change happened via rc.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h |  3 +++
 arch/s390/kvm/priv.c            | 18 ++++++++++++++----
 arch/s390/mm/pgtable.c          | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 91f0e7b79821bd..2f6702e27db9ce 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -893,6 +893,9 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char key, bool nq);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			       unsigned char key, unsigned char *oldkey,
+			       bool nq, bool mr, bool mc);
 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char *key);
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 71fa603034d057..752a1ac1aab626 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -654,8 +654,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 
 static int handle_pfmf(struct kvm_vcpu *vcpu)
 {
+	bool mr = false, mc = false, nq;
 	int reg1, reg2;
 	unsigned long start, end;
+	unsigned char key;
 
 	vcpu->stat.instruction_pfmf++;
 
@@ -675,6 +677,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	    !test_kvm_facility(vcpu->kvm, 14))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
+	/* Only provide conditional-SSKE support if enabled for the guest */
+	if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+	    test_kvm_facility(vcpu->kvm, 10)) {
+		mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+		mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+	}
+
+	nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+	key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	start = kvm_s390_logical_to_effective(vcpu, start);
 
@@ -723,11 +734,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 			if (rc)
 				return rc;
 			down_read(&current->mm->mmap_sem);
-			rc = set_guest_storage_key(current->mm, useraddr,
-					vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
-					vcpu->run->s.regs.gprs[reg1] & PFMF_NQ);
+			rc = cond_set_guest_storage_key(current->mm, useraddr,
+							key, NULL, nq, mr, mc);
 			up_read(&current->mm->mmap_sem);
-			if (rc)
+			if (rc < 0)
 				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 		}
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 3e35298758d6db..e791e8b27fd20b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -539,6 +539,39 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			       unsigned char key, unsigned char *oldkey,
+			       bool nq, bool mr, bool mc)
+{
+	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+	int rc;
+
+	/* we can drop the pgste lock between getting and setting the key */
+	if (mr | mc) {
+		rc = get_guest_storage_key(mm, addr, &tmp);
+		if (rc)
+			return rc;
+		if (oldkey)
+			*oldkey = tmp;
+		if (!mr)
+			mask |= _PAGE_REFERENCED;
+		if (!mc)
+			mask |= _PAGE_CHANGED;
+		if (!((tmp ^ key) & mask))
+			return 0;
+	}
+	rc = set_guest_storage_key(mm, addr, key, nq);
+	return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char *key)
 {

From 695be0e7a24a8875c347437566f2c44ba673580b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 12 May 2016 14:07:05 +0200
Subject: [PATCH 053/302] KVM: s390: pfmf: handle address overflows

In theory, end could always end up being < start, if overflowing to 0.
Although very unlikely for now, let's just fix it.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 752a1ac1aab626..b8327b8fdb8f9d 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -715,7 +715,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	}
 
-	while (start < end) {
+	while (start != end) {
 		unsigned long useraddr;
 
 		/* Translate guest address to host address */

From 238614515287c9400727e4cd7aa958649dcbf05f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 12:56:43 +0100
Subject: [PATCH 054/302] s390/sclp: detect storage-key facility

Let's correctly detect that facility.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 4 +++-
 drivers/s390/char/sclp_early.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 99a0150d07b975..2ad9c204b1a2fc 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -32,7 +32,8 @@ struct sclp_core_entry {
 	u8 reserved0;
 	u8 : 4;
 	u8 sief2 : 1;
-	u8 : 3;
+	u8 skey : 1;
+	u8 : 2;
 	u8 : 2;
 	u8 gpere : 1;
 	u8 siif : 1;
@@ -73,6 +74,7 @@ struct sclp_info {
 	unsigned char has_cei : 1;
 	unsigned char has_pfmfi : 1;
 	unsigned char has_ibs : 1;
+	unsigned char has_skey : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index d5b873c92ffc24..c71df0c7dedc1c 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -154,6 +154,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
 		sclp.has_gpere = cpue->gpere;
 		sclp.has_ib = cpue->ib;
 		sclp.has_cei = cpue->cei;
+		sclp.has_skey = cpue->skey;
 		break;
 	}
 

From 11ddcd41bce5c2394b0390584236afdd13656998 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 10 May 2016 09:40:09 +0200
Subject: [PATCH 055/302] KVM: s390: trace and count all skey intercepts

Let's trace and count all skey handling operations, even if lazy skey
handling was already activated. Also, don't enable lazy skey handling if
anything went wrong while enabling skey handling for the SIE.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c  | 13 ++++++++-----
 arch/s390/kvm/trace.h |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index b8327b8fdb8f9d..6745c2a602c343 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -152,24 +152,27 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
 	int rc = 0;
+
+	trace_kvm_s390_skey_related_inst(vcpu);
 	if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
 		return rc;
 
 	rc = s390_enable_skey();
-	VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
-	trace_kvm_s390_skey_related_inst(vcpu);
-	vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+	VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+	if (!rc)
+		vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
 	return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-	int rc = __skey_check_enable(vcpu);
+	int rc;
 
+	vcpu->stat.instruction_storage_key++;
+	rc = __skey_check_enable(vcpu);
 	if (rc)
 		return rc;
-	vcpu->stat.instruction_storage_key++;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 1c4586b367a443..4fc9d4e5be89c6 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
 	    TP_fast_assign(
 		    VCPU_ASSIGN_COMMON
 		    ),
-	    VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+	    VCPU_TP_PRINTK("%s", "storage key related instruction")
 	);
 
 TRACE_EVENT(kvm_s390_major_guest_pfault,

From a7e19ab55ffdd82f1a8d12694b9a0c0beeef534c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 10 May 2016 09:50:21 +0200
Subject: [PATCH 056/302] KVM: s390: handle missing storage-key facility

Without the storage-key facility, SIE won't interpret SSKE, ISKE and
RRBE for us. So let's add proper interception handlers that will be called
if lazy sske cannot be enabled.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/page.h    |   7 +-
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/priv.c            | 150 ++++++++++++++++++++++++++++++--
 arch/s390/mm/pgtable.c          |  37 ++++++++
 4 files changed, 184 insertions(+), 11 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 53eacbd4f09bf4..f874e7d51c1919 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -109,13 +109,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
 
 static inline int page_reset_referenced(unsigned long addr)
 {
-	unsigned int ipm;
+	int cc;
 
 	asm volatile(
 		"	rrbe	0,%1\n"
 		"	ipm	%0\n"
-		: "=d" (ipm) : "a" (addr) : "cc");
-	return !!(ipm & 0x20000000);
+		"	srl	%0,28\n"
+		: "=d" (cc) : "a" (addr) : "cc");
+	return cc;
 }
 
 /* Bits int the storage key */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2f6702e27db9ce..9951e7e5975632 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -896,6 +896,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			       unsigned char key, unsigned char *oldkey,
 			       bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char *key);
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 6745c2a602c343..3db3be13999299 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
 #include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/compat.h>
+#include <asm/sclp.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
 #include "trace.h"
@@ -164,8 +165,7 @@ static int __skey_check_enable(struct kvm_vcpu *vcpu)
 	return rc;
 }
 
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
 {
 	int rc;
 
@@ -173,12 +173,146 @@ static int handle_skey(struct kvm_vcpu *vcpu)
 	rc = __skey_check_enable(vcpu);
 	if (rc)
 		return rc;
-
+	if (sclp.has_skey) {
+		/* with storage-key facility, SIE interprets it for us */
+		kvm_s390_retry_instr(vcpu);
+		VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+		return -EAGAIN;
+	}
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+	return 0;
+}
 
-	kvm_s390_retry_instr(vcpu);
-	VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+	unsigned long addr;
+	unsigned char key;
+	int reg1, reg2;
+	int rc;
+
+	rc = try_handle_skey(vcpu);
+	if (rc)
+		return rc != -EAGAIN ? rc : 0;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+	addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	addr = kvm_s390_logical_to_effective(vcpu, addr);
+	addr = kvm_s390_real_to_abs(vcpu, addr);
+	addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+	if (kvm_is_error_hva(addr))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_guest_storage_key(current->mm, addr, &key);
+	up_read(&current->mm->mmap_sem);
+	if (rc)
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+	vcpu->run->s.regs.gprs[reg1] |= key;
+	return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+	unsigned long addr;
+	int reg1, reg2;
+	int rc;
+
+	rc = try_handle_skey(vcpu);
+	if (rc)
+		return rc != -EAGAIN ? rc : 0;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+	addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	addr = kvm_s390_logical_to_effective(vcpu, addr);
+	addr = kvm_s390_real_to_abs(vcpu, addr);
+	addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+	if (kvm_is_error_hva(addr))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+	down_read(&current->mm->mmap_sem);
+	rc = reset_guest_reference_bit(current->mm, addr);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+	kvm_s390_set_psw_cc(vcpu, rc);
+	return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+	unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+	unsigned long start, end;
+	unsigned char key, oldkey;
+	int reg1, reg2;
+	int rc;
+
+	rc = try_handle_skey(vcpu);
+	if (rc)
+		return rc != -EAGAIN ? rc : 0;
+
+	if (!test_kvm_facility(vcpu->kvm, 8))
+		m3 &= ~SSKE_MB;
+	if (!test_kvm_facility(vcpu->kvm, 10))
+		m3 &= ~(SSKE_MC | SSKE_MR);
+	if (!test_kvm_facility(vcpu->kvm, 14))
+		m3 &= ~SSKE_NQ;
+
+	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+	key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+	start = kvm_s390_logical_to_effective(vcpu, start);
+	if (m3 & SSKE_MB) {
+		/* start already designates an absolute address */
+		end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+	} else {
+		start = kvm_s390_real_to_abs(vcpu, start);
+		end = start + PAGE_SIZE;
+	}
+
+	while (start != end) {
+		unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+		if (kvm_is_error_hva(addr))
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+		down_read(&current->mm->mmap_sem);
+		rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+						m3 & SSKE_NQ, m3 & SSKE_MR,
+						m3 & SSKE_MC);
+		up_read(&current->mm->mmap_sem);
+		if (rc < 0)
+			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		start += PAGE_SIZE;
+	};
+
+	if (m3 & (SSKE_MC | SSKE_MR)) {
+		if (m3 & SSKE_MB) {
+			/* skey in reg1 is unpredictable */
+			kvm_s390_set_psw_cc(vcpu, 3);
+		} else {
+			kvm_s390_set_psw_cc(vcpu, rc);
+			vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+			vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+		}
+	}
+	if (m3 & SSKE_MB) {
+		if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+			vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+		else
+			vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+		end = kvm_s390_logical_to_effective(vcpu, end);
+		vcpu->run->s.regs.gprs[reg2] |= end;
+	}
 	return 0;
 }
 
@@ -586,9 +720,9 @@ static const intercept_handler_t b2_handlers[256] = {
 	[0x11] = handle_store_prefix,
 	[0x12] = handle_store_cpu_address,
 	[0x21] = handle_ipte_interlock,
-	[0x29] = handle_skey,
-	[0x2a] = handle_skey,
-	[0x2b] = handle_skey,
+	[0x29] = handle_iske,
+	[0x2a] = handle_rrbe,
+	[0x2b] = handle_sske,
 	[0x2c] = handle_test_block,
 	[0x30] = handle_io_inst,
 	[0x31] = handle_io_inst,
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index e791e8b27fd20b..fa286d0c0f2da3 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -572,6 +572,43 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(cond_set_guest_storage_key);
 
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl;
+	pgste_t old, new;
+	pte_t *ptep;
+	int cc = 0;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep))
+		return -EFAULT;
+
+	new = old = pgste_get_lock(ptep);
+	/* Reset guest reference bit only */
+	pgste_val(new) &= ~PGSTE_GR_BIT;
+
+	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+		cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+		/* Merge real referenced bit into host-set */
+		pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
+	}
+	/* Reflect guest's logical view, not physical */
+	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+	/* Changing the guest storage key is considered a change of the page */
+	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+		pgste_val(new) |= PGSTE_UC_BIT;
+
+	pgste_set_unlock(ptep, new);
+	pte_unmap_unlock(ptep, ptl);
+	return cc;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
 int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned char *key)
 {

From d40dd9e8da02a9905dea2329c0a8404ab8436622 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:04 +0100
Subject: [PATCH 057/302] MIPS: KVM: Drop unused guest_inst from kvm_vcpu_arch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MIPS kvm_vcpu_arch::guest_inst isn't used, so drop it from the
struct and drop its asm-offsets definition.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 1 -
 arch/mips/kernel/asm-offsets.c   | 2 --
 2 files changed, 3 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 36a391d289aa09..b310bb348443ed 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -347,7 +347,6 @@ struct kvm_vcpu_arch {
 	unsigned long host_cp0_cause;
 	unsigned long host_cp0_epc;
 	unsigned long host_cp0_entryhi;
-	uint32_t guest_inst;
 
 	/* GPRS */
 	unsigned long gprs[32];
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 1ea973b2abb1e0..4d96a9033f46aa 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -366,8 +366,6 @@ void output_kvm_defines(void)
 	OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
 	OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
 
-	OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
-
 	OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
 	OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
 	OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);

From e4e94c0fc8d66975f0822c52d04b366c6250dc64 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:05 +0100
Subject: [PATCH 058/302] MIPS: KVM: Drop unused host_cp0_entryhi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The host EntryHi in the KVM VCPU context is virtually unused. It gets
stored on exceptions, but only ever used in a kvm_debug() when a TLB
miss occurs.

Drop it entirely, removing that information from the kvm_debug output.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 1 -
 arch/mips/kernel/asm-offsets.c   | 1 -
 arch/mips/kvm/emulate.c          | 5 ++---
 arch/mips/kvm/locore.S           | 3 ---
 4 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b310bb348443ed..cbcedd7a684bd2 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -346,7 +346,6 @@ struct kvm_vcpu_arch {
 	unsigned long host_cp0_badvaddr;
 	unsigned long host_cp0_cause;
 	unsigned long host_cp0_epc;
-	unsigned long host_cp0_entryhi;
 
 	/* GPRS */
 	unsigned long gprs[32];
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 4d96a9033f46aa..420808899c7004 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -364,7 +364,6 @@ void output_kvm_defines(void)
 	OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
 	OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
 	OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
-	OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
 
 	OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
 	OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 645c8a1982a7b2..2836668d63fc1e 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1634,7 +1634,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
 						   (cop0) & KVM_ENTRYHI_ASID));
 
 		if (index < 0) {
-			vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
 			vcpu->arch.host_cp0_badvaddr = va;
 			vcpu->arch.pc = curr_pc;
 			er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
@@ -2576,8 +2575,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
 	unsigned long va = vcpu->arch.host_cp0_badvaddr;
 	int index;
 
-	kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n",
-		  vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi);
+	kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n",
+		  vcpu->arch.host_cp0_badvaddr);
 
 	/*
 	 * KVM would not have got the exception if this entry was valid in the
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index 828fcfc1cd7fe9..5ad2d507b1253a 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -308,9 +308,6 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
 	mfc0	k0, CP0_CAUSE
 	LONG_S	k0, VCPU_HOST_CP0_CAUSE(k1)
 
-	mfc0	k0, CP0_ENTRYHI
-	LONG_S	k0, VCPU_HOST_ENTRYHI(k1)
-
 	/* Now restore the host state just enough to run the handlers */
 
 	/* Switch EBASE to the one used by Linux */

From 2193c713799d90e616d8a2d814c15b69dfaa24e1 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:06 +0100
Subject: [PATCH 059/302] MIPS: KVM: Drop unused kvm_mips_sync_icache()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function kvm_mips_sync_icache() is unused, so let's remove it.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 2836668d63fc1e..6c2adcfd7faf1a 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1529,32 +1529,6 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
 	return er;
 }
 
-int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
-{
-	unsigned long offset = (va & ~PAGE_MASK);
-	struct kvm *kvm = vcpu->kvm;
-	unsigned long pa;
-	gfn_t gfn;
-	kvm_pfn_t pfn;
-
-	gfn = va >> PAGE_SHIFT;
-
-	if (gfn >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn);
-		kvm_mips_dump_host_tlbs();
-		kvm_arch_vcpu_dump_regs(vcpu);
-		return -1;
-	}
-	pfn = kvm->arch.guest_pmap[gfn];
-	pa = (pfn << PAGE_SHIFT) | offset;
-
-	kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va,
-		  CKSEG0ADDR(pa));
-
-	local_flush_icache_range(CKSEG0ADDR(pa), 32);
-	return 0;
-}
-
 enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
 					     uint32_t cause,
 					     struct kvm_run *run,

From bdb7ed8608f8f1944414abaffdecf3c997dfc41e Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:07 +0100
Subject: [PATCH 060/302] MIPS: KVM: Convert headers to kernel sized types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert the MIPS kvm_host.h structs, function declaration prototypes and
associated definition prototypes to use standard kernel sized types
(e.g. u32) instead of inttypes.h style ones (e.g. uint32_t).

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 97 ++++++++++++++++----------------
 arch/mips/kvm/dyntrans.c         |  8 +--
 arch/mips/kvm/emulate.c          | 67 +++++++++++-----------
 arch/mips/kvm/interrupt.c        | 10 ++--
 arch/mips/kvm/interrupt.h        | 10 ++--
 arch/mips/kvm/tlb.c              |  8 +--
 6 files changed, 98 insertions(+), 102 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index cbcedd7a684bd2..9250b59acd1842 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -368,11 +368,11 @@ struct kvm_vcpu_arch {
 
 	struct hrtimer comparecount_timer;
 	/* Count timer control KVM register */
-	uint32_t count_ctl;
+	u32 count_ctl;
 	/* Count bias from the raw time */
-	uint32_t count_bias;
+	u32 count_bias;
 	/* Frequency of timer in Hz */
-	uint32_t count_hz;
+	u32 count_hz;
 	/* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
 	s64 count_dyn_bias;
 	/* Resume time */
@@ -395,8 +395,8 @@ struct kvm_vcpu_arch {
 	struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
 
 	/* Cached guest kernel/user ASIDs */
-	uint32_t guest_user_asid[NR_CPUS];
-	uint32_t guest_kernel_asid[NR_CPUS];
+	u32 guest_user_asid[NR_CPUS];
+	u32 guest_kernel_asid[NR_CPUS];
 	struct mm_struct guest_kernel_mm, guest_user_mm;
 
 	int last_sched_cpu;
@@ -587,9 +587,9 @@ struct kvm_mips_callbacks {
 	void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
 			       struct kvm_mips_interrupt *irq);
 	int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
-			   uint32_t cause);
+			   u32 cause);
 	int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
-			 uint32_t cause);
+			 u32 cause);
 	int (*get_one_reg)(struct kvm_vcpu *vcpu,
 			   const struct kvm_one_reg *reg, s64 *v);
 	int (*set_one_reg)(struct kvm_vcpu *vcpu,
@@ -620,11 +620,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu);
 void kvm_lose_fpu(struct kvm_vcpu *vcpu);
 
 /* TLB handling */
-uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
-uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
+u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
 					   struct kvm_vcpu *vcpu);
@@ -638,12 +638,12 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 						unsigned long *hpa1);
 
 extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-						     uint32_t *opc,
+						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
-						    uint32_t *opc,
+						    u32 *opc,
 						    struct kvm_run *run,
 						    struct kvm_vcpu *vcpu);
 
@@ -665,90 +665,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
 
 /* Emulation */
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu);
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause);
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
 
 extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
-						   uint32_t *opc,
+						   u32 *opc,
 						   struct kvm_run *run,
 						   struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-						      uint32_t *opc,
+						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-							 uint32_t *opc,
+							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-							uint32_t *opc,
+							u32 *opc,
 							struct kvm_run *run,
 							struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-							 uint32_t *opc,
+							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-							uint32_t *opc,
+							u32 *opc,
 							struct kvm_run *run,
 							struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-						     uint32_t *opc,
+						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-						      uint32_t *opc,
+						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
-						uint32_t *opc,
+						u32 *opc,
 						struct kvm_run *run,
 						struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-						     uint32_t *opc,
+						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-						     uint32_t *opc,
+						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-						       uint32_t *opc,
+						       u32 *opc,
 						       struct kvm_run *run,
 						       struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-							 uint32_t *opc,
+							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-						      uint32_t *opc,
+						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-							 uint32_t *opc,
+							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 							 struct kvm_run *run);
 
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
 void kvm_mips_init_count(struct kvm_vcpu *vcpu);
 int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
 int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@ -758,26 +758,26 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
 enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-					       uint32_t *opc,
+					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst,
-					     uint32_t *opc,
-					     uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(u32 inst,
+					     u32 *opc,
+					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst,
-					   uint32_t *opc,
-					   uint32_t cause,
+enum emulation_result kvm_mips_emulate_CP0(u32 inst,
+					   u32 *opc,
+					   u32 cause,
 					   struct kvm_run *run,
 					   struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(uint32_t inst,
-					     uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(u32 inst,
+					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(uint32_t inst,
-					    uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(u32 inst,
+					    u32 cause,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu);
 
@@ -787,14 +787,11 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
 /* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
 				      struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
-				   struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc,
-			       struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
-			       struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
 
 /* Misc */
 extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index f1527a465c1b1b..e79502a88534ff 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -28,7 +28,7 @@
 #define CLEAR_TEMPLATE  0x00000020
 #define SW_TEMPLATE     0xac000000
 
-int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
 			       struct kvm_vcpu *vcpu)
 {
 	int result = 0;
@@ -49,7 +49,7 @@ int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
  * Address based CACHE instructions are transformed into synci(s). A little
  * heavy for just D-cache invalidates, but avoids an expensive trap
  */
-int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 			    struct kvm_vcpu *vcpu)
 {
 	int result = 0;
@@ -70,7 +70,7 @@ int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
 	return result;
 }
 
-int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
 	int32_t rt, rd, sel;
 	uint32_t mfc0_inst;
@@ -110,7 +110,7 @@ int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
 	int32_t rt, rd, sel;
 	uint32_t mtc0_inst = SW_TEMPLATE;
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 6c2adcfd7faf1a..c59c51908476f5 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -198,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 	return nextpc;
 }
 
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
 {
 	unsigned long branch_pc;
 	enum emulation_result er = EMULATE_DONE;
@@ -243,7 +243,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
  *
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
-static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
 {
 	s64 now_ns, periods;
 	u64 delta;
@@ -300,7 +300,7 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
  *
  * Returns:	The current value of the guest CP0_Count register.
  */
-static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	ktime_t expires, threshold;
@@ -360,7 +360,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
  *
  * Returns:	The current guest CP0_Count value.
  */
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 
@@ -387,8 +387,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
  *
  * Returns:	The ktime at the point of freeze.
  */
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
-				       uint32_t *count)
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
 {
 	ktime_t now;
 
@@ -419,7 +418,7 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
  * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
  */
 static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
-				    ktime_t now, uint32_t count)
+				    ktime_t now, u32 count)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	uint32_t compare;
@@ -444,7 +443,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
  *
  * Sets the CP0_Count value and updates the timer accordingly.
  */
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	ktime_t now;
@@ -538,7 +537,7 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
  * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
  * any pending timer interrupt is preserved.
  */
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	int dc;
@@ -973,8 +972,8 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
 	return mask;
 }
 
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
-					   uint32_t cause, struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
+					   struct kvm_run *run,
 					   struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -1312,7 +1311,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
@@ -1424,7 +1423,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
@@ -1529,8 +1528,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
-					     uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
+					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
@@ -1687,7 +1686,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
@@ -1735,7 +1734,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
 }
 
 enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
-					       uint32_t *opc,
+					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
 {
@@ -1770,7 +1769,7 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
-						  uint32_t *opc,
+						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
@@ -1816,7 +1815,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
-						 uint32_t *opc,
+						 u32 *opc,
 						 struct kvm_run *run,
 						 struct kvm_vcpu *vcpu)
 {
@@ -1862,7 +1861,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
-						  uint32_t *opc,
+						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
@@ -1906,7 +1905,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
-						 uint32_t *opc,
+						 u32 *opc,
 						 struct kvm_run *run,
 						 struct kvm_vcpu *vcpu)
 {
@@ -1950,7 +1949,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
 }
 
 /* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
@@ -1979,7 +1978,7 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
 }
 
 enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
-					      uint32_t *opc,
+					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
 {
@@ -2022,7 +2021,7 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
-					       uint32_t *opc,
+					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
 {
@@ -2051,7 +2050,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
-					      uint32_t *opc,
+					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
 {
@@ -2086,7 +2085,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
-					      uint32_t *opc,
+					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
 {
@@ -2121,7 +2120,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
-						uint32_t *opc,
+						u32 *opc,
 						struct kvm_run *run,
 						struct kvm_vcpu *vcpu)
 {
@@ -2156,7 +2155,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
-						  uint32_t *opc,
+						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
@@ -2191,7 +2190,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
-					       uint32_t *opc,
+					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
 {
@@ -2226,7 +2225,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
-						  uint32_t *opc,
+						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
@@ -2275,7 +2274,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
 #define SYNC   0x0000000f
 #define RDHWR  0x0000003b
 
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc,
 					 struct kvm_run *run,
 					 struct kvm_vcpu *vcpu)
 {
@@ -2406,7 +2405,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 }
 
 static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
-						  uint32_t *opc,
+						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
@@ -2444,7 +2443,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
 }
 
 enum emulation_result kvm_mips_check_privilege(unsigned long cause,
-					       uint32_t *opc,
+					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
 {
@@ -2540,7 +2539,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
  *     case we inject the TLB from the Guest TLB into the shadow host TLB
  */
 enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
-					      uint32_t *opc,
+					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
 {
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index 95f790663b0c2a..49ce83237fc3b0 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -22,12 +22,12 @@
 
 #include "interrupt.h"
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
 	set_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
 {
 	clear_bit(priority, &vcpu->arch.pending_exceptions);
 }
@@ -114,7 +114,7 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
 
 /* Deliver the interrupt of the corresponding priority, if possible. */
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-			    uint32_t cause)
+			    u32 cause)
 {
 	int allowed = 0;
 	uint32_t exccode;
@@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
 }
 
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-			  uint32_t cause)
+			  u32 cause)
 {
 	return 1;
 }
 
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause)
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr;
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index 2143884709e476..d661c100b2198e 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -37,8 +37,8 @@ extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
 
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 int kvm_mips_pending_timer(struct kvm_vcpu *vcpu);
 
 void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu);
@@ -48,7 +48,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu,
 void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
 				struct kvm_mips_interrupt *irq);
 int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-			    uint32_t cause);
+			    u32 cause);
 int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
-			  uint32_t cause);
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause);
+			  u32 cause);
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index ed021ae7867a79..7ea346e150a88f 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -47,7 +47,7 @@ EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
 bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
 
-uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
 	int cpu = smp_processor_id();
 
@@ -55,7 +55,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 			cpu_asid_mask(&cpu_data[cpu]);
 }
 
-uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 {
 	int cpu = smp_processor_id();
 
@@ -63,7 +63,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 			cpu_asid_mask(&cpu_data[cpu]);
 }
 
-inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
+inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->arch.commpage_tlb;
 }
@@ -751,7 +751,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
 
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	unsigned long paddr, flags, vpn2, asid;

From 8cffd197485122632103a12fdada911242e7c01e Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:08 +0100
Subject: [PATCH 061/302] MIPS: KVM: Convert code to kernel sized types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert the MIPS KVM C code to use standard kernel sized types (e.g.
u32), or other types as appropriate, instead of inttypes.h-style ones
(e.g. uint32_t).

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c  |  24 ++++-----
 arch/mips/kvm/emulate.c   | 104 +++++++++++++++++++-------------------
 arch/mips/kvm/interrupt.c |   2 +-
 arch/mips/kvm/mips.c      |   8 +--
 arch/mips/kvm/tlb.c       |  28 +++++-----
 arch/mips/kvm/trap_emul.c |  30 +++++------
 6 files changed, 98 insertions(+), 98 deletions(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index e79502a88534ff..d4a86fb239cdf4 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -33,13 +33,13 @@ int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
 {
 	int result = 0;
 	unsigned long kseg0_opc;
-	uint32_t synci_inst = 0x0;
+	u32 synci_inst = 0x0;
 
 	/* Replace the CACHE instruction, with a NOP */
 	kseg0_opc =
 	    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
 		       (vcpu, (unsigned long) opc));
-	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
+	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32));
 	local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 
 	return result;
@@ -54,7 +54,7 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 {
 	int result = 0;
 	unsigned long kseg0_opc;
-	uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
+	u32 synci_inst = SYNCI_TEMPLATE, base, offset;
 
 	base = (inst >> 21) & 0x1f;
 	offset = inst & 0xffff;
@@ -64,7 +64,7 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 	kseg0_opc =
 	    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
 		       (vcpu, (unsigned long) opc));
-	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
+	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32));
 	local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 
 	return result;
@@ -72,8 +72,8 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 
 int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
-	int32_t rt, rd, sel;
-	uint32_t mfc0_inst;
+	u32 rt, rd, sel;
+	u32 mfc0_inst;
 	unsigned long kseg0_opc, flags;
 
 	rt = (inst >> 16) & 0x1f;
@@ -94,11 +94,11 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 		kseg0_opc =
 		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
 			       (vcpu, (unsigned long) opc));
-		memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
+		memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(u32));
 		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
 		local_irq_save(flags);
-		memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
+		memcpy((void *)opc, (void *)&mfc0_inst, sizeof(u32));
 		local_flush_icache_range((unsigned long)opc,
 					 (unsigned long)opc + 32);
 		local_irq_restore(flags);
@@ -112,8 +112,8 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 
 int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
-	int32_t rt, rd, sel;
-	uint32_t mtc0_inst = SW_TEMPLATE;
+	u32 rt, rd, sel;
+	u32 mtc0_inst = SW_TEMPLATE;
 	unsigned long kseg0_opc, flags;
 
 	rt = (inst >> 16) & 0x1f;
@@ -127,11 +127,11 @@ int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 		kseg0_opc =
 		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
 			       (vcpu, (unsigned long) opc));
-		memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
+		memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(u32));
 		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
 	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
 		local_irq_save(flags);
-		memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
+		memcpy((void *)opc, (void *)&mtc0_inst, sizeof(u32));
 		local_flush_icache_range((unsigned long)opc,
 					 (unsigned long)opc + 32);
 		local_irq_restore(flags);
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index c59c51908476f5..8f4f3242a65591 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		goto unaligned;
 
 	/* Read the instruction */
-	insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+	insn.word = kvm_get_inst((u32 *) epc, vcpu);
 
 	if (insn.word == KVM_INVALID_INST)
 		return KVM_INVALID_INST;
@@ -304,7 +304,7 @@ static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	ktime_t expires, threshold;
-	uint32_t count, compare;
+	u32 count, compare;
 	int running;
 
 	/* Calculate the biased and scaled guest CP0_Count */
@@ -315,7 +315,7 @@ static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
 	 * Find whether CP0_Count has reached the closest timer interrupt. If
 	 * not, we shouldn't inject it.
 	 */
-	if ((int32_t)(count - compare) < 0)
+	if ((s32)(count - compare) < 0)
 		return count;
 
 	/*
@@ -421,13 +421,13 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
 				    ktime_t now, u32 count)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	uint32_t compare;
+	u32 compare;
 	u64 delta;
 	ktime_t expire;
 
 	/* Calculate timeout (wrap 0 to 2^32) */
 	compare = kvm_read_c0_guest_compare(cop0);
-	delta = (u64)(uint32_t)(compare - count - 1) + 1;
+	delta = (u64)(u32)(compare - count - 1) + 1;
 	delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
 	expire = ktime_add_ns(now, delta);
 
@@ -543,7 +543,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 	int dc;
 	u32 old_compare = kvm_read_c0_guest_compare(cop0);
 	ktime_t now;
-	uint32_t count;
+	u32 count;
 
 	/* if unchanged, must just be an ack */
 	if (old_compare == compare) {
@@ -584,7 +584,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	uint32_t count;
+	u32 count;
 	ktime_t now;
 
 	/* Stop hrtimer */
@@ -631,7 +631,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
 void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	uint32_t count;
+	u32 count;
 
 	kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
 
@@ -660,7 +660,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
 	s64 changed = count_ctl ^ vcpu->arch.count_ctl;
 	s64 delta;
 	ktime_t expire, now;
-	uint32_t count, compare;
+	u32 count, compare;
 
 	/* Only allow defined bits to be changed */
 	if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
@@ -686,7 +686,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
 			 */
 			count = kvm_read_c0_guest_count(cop0);
 			compare = kvm_read_c0_guest_compare(cop0);
-			delta = (u64)(uint32_t)(compare - count - 1) + 1;
+			delta = (u64)(u32)(compare - count - 1) + 1;
 			delta = div_u64(delta * NSEC_PER_SEC,
 					vcpu->arch.count_hz);
 			expire = ktime_add_ns(vcpu->arch.count_resume, delta);
@@ -800,9 +800,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	uint32_t pc = vcpu->arch.pc;
+	unsigned long pc = vcpu->arch.pc;
 
-	kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
+	kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
 	return EMULATE_FAIL;
 }
 
@@ -812,11 +812,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	int index = kvm_read_c0_guest_index(cop0);
 	struct kvm_mips_tlb *tlb = NULL;
-	uint32_t pc = vcpu->arch.pc;
+	unsigned long pc = vcpu->arch.pc;
 
 	if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
 		kvm_debug("%s: illegal index: %d\n", __func__, index);
-		kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+		kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
 			  pc, index, kvm_read_c0_guest_entryhi(cop0),
 			  kvm_read_c0_guest_entrylo0(cop0),
 			  kvm_read_c0_guest_entrylo1(cop0),
@@ -836,7 +836,7 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 	tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
 	tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
 
-	kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+	kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
 		  pc, index, kvm_read_c0_guest_entryhi(cop0),
 		  kvm_read_c0_guest_entrylo0(cop0),
 		  kvm_read_c0_guest_entrylo1(cop0),
@@ -850,7 +850,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_mips_tlb *tlb = NULL;
-	uint32_t pc = vcpu->arch.pc;
+	unsigned long pc = vcpu->arch.pc;
 	int index;
 
 	get_random_bytes(&index, sizeof(index));
@@ -869,7 +869,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 	tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
 	tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
 
-	kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
+	kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
 		  pc, index, kvm_read_c0_guest_entryhi(cop0),
 		  kvm_read_c0_guest_entrylo0(cop0),
 		  kvm_read_c0_guest_entrylo1(cop0));
@@ -881,14 +881,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	long entryhi = kvm_read_c0_guest_entryhi(cop0);
-	uint32_t pc = vcpu->arch.pc;
+	unsigned long pc = vcpu->arch.pc;
 	int index = -1;
 
 	index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
 
 	kvm_write_c0_guest_index(cop0, index);
 
-	kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
+	kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
 		  index);
 
 	return EMULATE_DONE;
@@ -978,8 +978,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
-	int32_t rt, rd, copz, sel, co_bit, op;
-	uint32_t pc = vcpu->arch.pc;
+	u32 rt, rd, copz, sel, co_bit, op;
+	unsigned long pc = vcpu->arch.pc;
 	unsigned long curr_pc;
 
 	/*
@@ -1047,7 +1047,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 			}
 
 			kvm_debug
-			    ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
+			    ("[%#lx] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
 			     pc, rd, sel, rt, vcpu->arch.gprs[rt]);
 
 			break;
@@ -1077,7 +1077,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
 					kvm_read_c0_guest_ebase(cop0));
 			} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
-				uint32_t nasid =
+				u32 nasid =
 					vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
 				if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
 				    ((kvm_read_c0_guest_entryhi(cop0) &
@@ -1099,7 +1099,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
 				goto done;
 			} else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
-				kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
+				kvm_debug("[%#lx] MTCz, COMPARE %#lx <- %#lx\n",
 					  pc, kvm_read_c0_guest_compare(cop0),
 					  vcpu->arch.gprs[rt]);
 
@@ -1218,7 +1218,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 
 				kvm_write_c0_guest_config5(cop0, val);
 			} else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
-				uint32_t old_cause, new_cause;
+				u32 old_cause, new_cause;
 
 				old_cause = kvm_read_c0_guest_cause(cop0);
 				new_cause = vcpu->arch.gprs[rt];
@@ -1239,7 +1239,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 #endif
 			}
 
-			kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
+			kvm_debug("[%#lx] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
 				  rd, sel, cop0->reg[rd][sel]);
 			break;
 
@@ -1271,9 +1271,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 
 		case wrpgpr_op:
 			{
-				uint32_t css =
-				    cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
-				uint32_t pss =
+				u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
+				u32 pss =
 				    (cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf;
 				/*
 				 * We don't support any shadow register sets, so
@@ -1316,8 +1315,9 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 					     struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DO_MMIO;
-	int32_t op, base, rt, offset;
-	uint32_t bytes;
+	u32 op, base, rt;
+	s16 offset;
+	u32 bytes;
 	void *data = run->mmio.data;
 	unsigned long curr_pc;
 
@@ -1332,7 +1332,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 
 	rt = (inst >> 16) & 0x1f;
 	base = (inst >> 21) & 0x1f;
-	offset = inst & 0xffff;
+	offset = (s16)inst;
 	op = (inst >> 26) & 0x3f;
 
 	switch (op) {
@@ -1356,7 +1356,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 		*(u8 *) data = vcpu->arch.gprs[rt];
 		kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
 			  vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
-			  *(uint8_t *) data);
+			  *(u8 *) data);
 
 		break;
 
@@ -1378,11 +1378,11 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 		run->mmio.is_write = 1;
 		vcpu->mmio_needed = 1;
 		vcpu->mmio_is_write = 1;
-		*(uint32_t *) data = vcpu->arch.gprs[rt];
+		*(u32 *) data = vcpu->arch.gprs[rt];
 
 		kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
 			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-			  vcpu->arch.gprs[rt], *(uint32_t *) data);
+			  vcpu->arch.gprs[rt], *(u32 *) data);
 		break;
 
 	case sh_op:
@@ -1403,11 +1403,11 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 		run->mmio.is_write = 1;
 		vcpu->mmio_needed = 1;
 		vcpu->mmio_is_write = 1;
-		*(uint16_t *) data = vcpu->arch.gprs[rt];
+		*(u16 *) data = vcpu->arch.gprs[rt];
 
 		kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
 			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-			  vcpu->arch.gprs[rt], *(uint32_t *) data);
+			  vcpu->arch.gprs[rt], *(u32 *) data);
 		break;
 
 	default:
@@ -1428,12 +1428,13 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
 					    struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DO_MMIO;
-	int32_t op, base, rt, offset;
-	uint32_t bytes;
+	u32 op, base, rt;
+	s16 offset;
+	u32 bytes;
 
 	rt = (inst >> 16) & 0x1f;
 	base = (inst >> 21) & 0x1f;
-	offset = inst & 0xffff;
+	offset = (s16)inst;
 	op = (inst >> 26) & 0x3f;
 
 	vcpu->arch.pending_load_cause = cause;
@@ -1535,7 +1536,8 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
-	int32_t offset, cache, op_inst, op, base;
+	u32 cache, op_inst, op, base;
+	s16 offset;
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	unsigned long va;
 	unsigned long curr_pc;
@@ -1551,7 +1553,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
 
 	base = (inst >> 21) & 0x1f;
 	op_inst = (inst >> 16) & 0x1f;
-	offset = (int16_t)inst;
+	offset = (s16)inst;
 	cache = op_inst & CacheOp_Cache;
 	op = op_inst & CacheOp_Op;
 
@@ -1691,7 +1693,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc,
 					    struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DONE;
-	uint32_t inst;
+	u32 inst;
 
 	/* Fetch the instruction. */
 	if (cause & CAUSEF_BD)
@@ -2282,7 +2284,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc,
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	enum emulation_result er = EMULATE_DONE;
 	unsigned long curr_pc;
-	uint32_t inst;
+	u32 inst;
 
 	/*
 	 * Update PC and hold onto current PC in case there is
@@ -2377,19 +2379,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 
 	switch (run->mmio.len) {
 	case 4:
-		*gpr = *(int32_t *) run->mmio.data;
+		*gpr = *(s32 *) run->mmio.data;
 		break;
 
 	case 2:
 		if (vcpu->mmio_needed == 2)
-			*gpr = *(int16_t *) run->mmio.data;
+			*gpr = *(s16 *) run->mmio.data;
 		else
-			*gpr = *(uint16_t *)run->mmio.data;
+			*gpr = *(u16 *)run->mmio.data;
 
 		break;
 	case 1:
 		if (vcpu->mmio_needed == 2)
-			*gpr = *(int8_t *) run->mmio.data;
+			*gpr = *(s8 *) run->mmio.data;
 		else
 			*gpr = *(u8 *) run->mmio.data;
 		break;
@@ -2409,7 +2411,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
 {
-	uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+	u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	enum emulation_result er = EMULATE_DONE;
@@ -2448,7 +2450,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
 					       struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DONE;
-	uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+	u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 
 	int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
@@ -2544,7 +2546,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
 					      struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DONE;
-	uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+	u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
 	unsigned long va = vcpu->arch.host_cp0_badvaddr;
 	int index;
 
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index 49ce83237fc3b0..ad28dac6b7e955 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -117,7 +117,7 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
 			    u32 cause)
 {
 	int allowed = 0;
-	uint32_t exccode;
+	u32 exccode;
 
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 44da5259f39027..a2b1b9205b943d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1222,7 +1222,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 static void kvm_mips_set_c0_status(void)
 {
-	uint32_t status = read_c0_status();
+	u32 status = read_c0_status();
 
 	if (cpu_has_dsp)
 		status |= (ST0_MX);
@@ -1236,9 +1236,9 @@ static void kvm_mips_set_c0_status(void)
  */
 int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
-	uint32_t cause = vcpu->arch.host_cp0_cause;
-	uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 7ea346e150a88f..c4e11e13804267 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -32,8 +32,6 @@
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
-#define PRIx64 "llx"
-
 atomic_t kvm_mips_instance;
 EXPORT_SYMBOL_GPL(kvm_mips_instance);
 
@@ -102,13 +100,13 @@ void kvm_mips_dump_host_tlbs(void)
 		kvm_info("TLB%c%3d Hi 0x%08lx ",
 			 (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
 			 i, tlb.tlb_hi);
-		kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-			 (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
+		kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
 			 (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
 			 (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
 			 (tlb.tlb_lo0 >> 3) & 7);
-		kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-			 (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
+		kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
 			 (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
 			 (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
 			 (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
@@ -134,13 +132,13 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
 		kvm_info("TLB%c%3d Hi 0x%08lx ",
 			 (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
 			 i, tlb.tlb_hi);
-		kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
-			 (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
+		kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
 			 (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
 			 (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
 			 (tlb.tlb_lo0 >> 3) & 7);
-		kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
-			 (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
+		kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
 			 (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
 			 (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
 			 (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
@@ -160,7 +158,7 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
 	pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
 
 	if (kvm_mips_is_error_pfn(pfn)) {
-		kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn);
+		kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
 		err = -EFAULT;
 		goto out;
 	}
@@ -176,7 +174,7 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
 						    unsigned long gva)
 {
 	gfn_t gfn;
-	uint32_t offset = gva & ~PAGE_MASK;
+	unsigned long offset = gva & ~PAGE_MASK;
 	struct kvm *kvm = vcpu->kvm;
 
 	if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
@@ -726,7 +724,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
-	uint32_t cpu;
+	int cpu;
 
 	local_irq_save(flags);
 
@@ -755,7 +753,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	unsigned long paddr, flags, vpn2, asid;
-	uint32_t inst;
+	u32 inst;
 	int index;
 
 	if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
@@ -787,7 +785,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 		paddr =
 		    kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
 							  (unsigned long) opc);
-		inst = *(uint32_t *) CKSEG0ADDR(paddr);
+		inst = *(u32 *) CKSEG0ADDR(paddr);
 	} else {
 		kvm_err("%s: illegal address: %p\n", __func__, opc);
 		return KVM_INVALID_INST;
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 6ba0fafcecbc9e..4aa5d77b0d6ad6 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -21,7 +21,7 @@
 static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 {
 	gpa_t gpa;
-	uint32_t kseg = KSEGX(gva);
+	gva_t kseg = KSEGX(gva);
 
 	if ((kseg == CKSEG0) || (kseg == CKSEG1))
 		gpa = CPHYSADDR(gva);
@@ -40,7 +40,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -87,7 +87,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
@@ -131,7 +131,7 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
@@ -178,7 +178,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
@@ -232,7 +232,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
@@ -262,7 +262,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
@@ -292,7 +292,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -310,7 +310,7 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -328,7 +328,7 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -346,7 +346,7 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -364,7 +364,7 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -382,7 +382,7 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -407,7 +407,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_run *run = vcpu->run;
-	uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
@@ -457,7 +457,7 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	uint32_t config1;
+	u32 config1;
 	int vcpu_id = vcpu->vcpu_id;
 
 	/*

From 31cf7498545c36cc992887bd6af17a496f26f681 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:09 +0100
Subject: [PATCH 062/302] MIPS: KVM: Make various Cause variables 32-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CP0 Cause register is passed around in KVM quite a bit, often as an
unsigned long, even though it is always 32 bits long.

Resize it to u32 throughout MIPS KVM.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 40 +++++++++++++++---------------
 arch/mips/kvm/emulate.c          | 38 ++++++++++++++---------------
 arch/mips/kvm/locore.S           |  2 +-
 arch/mips/kvm/trap_emul.c        | 42 ++++++++++++++++----------------
 4 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 9250b59acd1842..dceb49422e3b1c 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -344,8 +344,8 @@ struct kvm_vcpu_arch {
 
 	/* Host CP0 registers used when handling exits from guest */
 	unsigned long host_cp0_badvaddr;
-	unsigned long host_cp0_cause;
 	unsigned long host_cp0_epc;
+	u32 host_cp0_cause;
 
 	/* GPRS */
 	unsigned long gprs[32];
@@ -386,7 +386,7 @@ struct kvm_vcpu_arch {
 	/* Bitmask of pending exceptions to be cleared */
 	unsigned long pending_exceptions_clr;
 
-	unsigned long pending_load_cause;
+	u32 pending_load_cause;
 
 	/* Save/Restore the entryhi register when are are preempted/scheduled back in */
 	unsigned long preempt_entryhi;
@@ -637,12 +637,12 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 						unsigned long *hpa0,
 						unsigned long *hpa1);
 
-extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
+extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
+extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
 						    u32 *opc,
 						    struct kvm_run *run,
 						    struct kvm_vcpu *vcpu);
@@ -668,77 +668,77 @@ extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
 u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
 
-extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
 						   u32 *opc,
 						   struct kvm_run *run,
 						   struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
 						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
 							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
 							u32 *opc,
 							struct kvm_run *run,
 							struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
 							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
 							u32 *opc,
 							struct kvm_run *run,
 							struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
 						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
 						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
+extern enum emulation_result kvm_mips_handle_ri(u32 cause,
 						u32 *opc,
 						struct kvm_run *run,
 						struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
 						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
 						     u32 *opc,
 						     struct kvm_run *run,
 						     struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
 						       u32 *opc,
 						       struct kvm_run *run,
 						       struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
 							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
 						      u32 *opc,
 						      struct kvm_run *run,
 						      struct kvm_vcpu *vcpu);
 
-extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
+extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
 							 u32 *opc,
 							 struct kvm_run *run,
 							 struct kvm_vcpu *vcpu);
@@ -757,7 +757,7 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
 void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 8f4f3242a65591..3baab5ec3d3b00 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1688,7 +1688,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc,
+enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
@@ -1735,7 +1735,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
+enum emulation_result kvm_mips_emulate_syscall(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
@@ -1770,7 +1770,7 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
+enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
 						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
@@ -1816,7 +1816,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
 	return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
+enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
 						 u32 *opc,
 						 struct kvm_run *run,
 						 struct kvm_vcpu *vcpu)
@@ -1862,7 +1862,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
 	return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
+enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
 						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
@@ -1906,7 +1906,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
 	return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
+enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
 						 u32 *opc,
 						 struct kvm_run *run,
 						 struct kvm_vcpu *vcpu)
@@ -1951,7 +1951,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
 }
 
 /* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc,
+enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
@@ -1979,7 +1979,7 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
+enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
@@ -2022,7 +2022,7 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
 	return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
@@ -2051,7 +2051,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
 	return EMULATE_DONE;
 }
 
-enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
@@ -2086,7 +2086,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
@@ -2121,7 +2121,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
 						u32 *opc,
 						struct kvm_run *run,
 						struct kvm_vcpu *vcpu)
@@ -2156,7 +2156,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
 						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
@@ -2191,7 +2191,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
@@ -2226,7 +2226,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
+enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
 						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
@@ -2276,7 +2276,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
 #define SYNC   0x0000000f
 #define RDHWR  0x0000003b
 
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc,
+enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 					 struct kvm_run *run,
 					 struct kvm_vcpu *vcpu)
 {
@@ -2406,7 +2406,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 	return er;
 }
 
-static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
+static enum emulation_result kvm_mips_emulate_exc(u32 cause,
 						  u32 *opc,
 						  struct kvm_run *run,
 						  struct kvm_vcpu *vcpu)
@@ -2444,7 +2444,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu)
@@ -2540,7 +2540,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
  * (2) TLB entry is present in the Guest TLB but not in the shadow, in this
  *     case we inject the TLB from the Guest TLB into the shadow host TLB
  */
-enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
+enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 					      u32 *opc,
 					      struct kvm_run *run,
 					      struct kvm_vcpu *vcpu)
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index 5ad2d507b1253a..43c8ef847efaaf 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -306,7 +306,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
 	LONG_S	k0, VCPU_HOST_CP0_BADVADDR(k1)
 
 	mfc0	k0, CP0_CAUSE
-	LONG_S	k0, VCPU_HOST_CP0_CAUSE(k1)
+	sw	k0, VCPU_HOST_CP0_CAUSE(k1)
 
 	/* Now restore the host state just enough to run the handlers */
 
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 4aa5d77b0d6ad6..ecf0068bc95eb0 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -41,7 +41,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -89,13 +89,13 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
 	if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
 	    || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-		kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
 			  cause, opc, badvaddr);
 		er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
 
@@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 		 * when we are not using HIGHMEM. Need to address this in a
 		 * HIGHMEM kernel
 		 */
-		kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		kvm_mips_dump_host_tlbs();
 		kvm_arch_vcpu_dump_regs(vcpu);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		ret = RESUME_HOST;
 	} else {
-		kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		kvm_mips_dump_host_tlbs();
 		kvm_arch_vcpu_dump_regs(vcpu);
@@ -133,7 +133,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -145,7 +145,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 		}
 	} else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
 		   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-		kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_debug("USER ADDR TLB LD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
 			  cause, opc, badvaddr);
 		er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
 		if (er == EMULATE_DONE)
@@ -165,7 +165,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 			ret = RESUME_HOST;
 		}
 	} else {
-		kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("Illegal TLB LD fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		kvm_mips_dump_host_tlbs();
 		kvm_arch_vcpu_dump_regs(vcpu);
@@ -180,7 +180,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -219,7 +219,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 			ret = RESUME_HOST;
 		}
 	} else {
-		kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("Illegal TLB ST fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		kvm_mips_dump_host_tlbs();
 		kvm_arch_vcpu_dump_regs(vcpu);
@@ -234,7 +234,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -251,7 +251,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 			ret = RESUME_HOST;
 		}
 	} else {
-		kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		ret = RESUME_HOST;
@@ -264,7 +264,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
 	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -280,7 +280,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
 			ret = RESUME_HOST;
 		}
 	} else {
-		kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
 			cause, opc, badvaddr);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		ret = RESUME_HOST;
@@ -293,7 +293,7 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -311,7 +311,7 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -329,7 +329,7 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -347,7 +347,7 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -365,7 +365,7 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -383,7 +383,7 @@ static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *)vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 
@@ -408,7 +408,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long cause = vcpu->arch.host_cp0_cause;
+	u32 cause = vcpu->arch.host_cp0_cause;
 	enum emulation_result er = EMULATE_DONE;
 	int ret = RESUME_GUEST;
 

From 403015b323a297475919e1a8ccc1ceb0fcb85f5f Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:10 +0100
Subject: [PATCH 063/302] MIPS: KVM: Move non-TLB handling code out of tlb.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Various functions in tlb.c perform higher level MMU handling, but don't
strictly need to be statically built into the kernel as they don't
directly manipulate TLB entries. Move these functions out into a
separate mmu.c which will be built into the KVM kernel module. This
allows them to directly reference KVM functions in the KVM kernel module
in future.

Module exports of these functions have been removed, since they aren't
needed outside of KVM.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |   4 +
 arch/mips/kvm/Makefile           |   1 +
 arch/mips/kvm/mmu.c              | 380 +++++++++++++++++++++++++++++++
 arch/mips/kvm/tlb.c              | 364 +----------------------------
 4 files changed, 389 insertions(+), 360 deletions(-)
 create mode 100644 arch/mips/kvm/mmu.c

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index dceb49422e3b1c..f64be7987a3256 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -649,6 +649,10 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
 
 extern void kvm_mips_dump_host_tlbs(void);
 extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
+extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
+				   unsigned long entrylo0,
+				   unsigned long entrylo1,
+				   int flush_dcache_mask);
 extern void kvm_mips_flush_host_tlb(int skip_kseg0);
 extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
 
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 637ebbebd54970..0aabe40fcac9b7 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -10,6 +10,7 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
 	    interrupt.o stats.o commpage.o \
 	    dyntrans.o trap_emul.o fpu.o
+kvm-objs += mmu.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
 obj-y			+= callback.o tlb.o
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
new file mode 100644
index 00000000000000..d18cadfd6e0159
--- /dev/null
+++ b/arch/mips/kvm/mmu.c
@@ -0,0 +1,380 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS MMU handling in the KVM module.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mmu_context.h>
+
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+
+	return vcpu->arch.guest_kernel_asid[cpu] &
+			cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+
+	return vcpu->arch.guest_user_asid[cpu] &
+			cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+{
+	int srcu_idx, err = 0;
+	kvm_pfn_t pfn;
+
+	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+		return 0;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
+
+	if (kvm_mips_is_error_pfn(pfn)) {
+		kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
+		err = -EFAULT;
+		goto out;
+	}
+
+	kvm->arch.guest_pmap[gfn] = pfn;
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return err;
+}
+
+/* Translate guest KSEG0 addresses to Host PA */
+unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
+						    unsigned long gva)
+{
+	gfn_t gfn;
+	unsigned long offset = gva & ~PAGE_MASK;
+	struct kvm *kvm = vcpu->kvm;
+
+	if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
+		kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
+			__builtin_return_address(0), gva);
+		return KVM_INVALID_PAGE;
+	}
+
+	gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+
+	if (gfn >= kvm->arch.guest_pmap_npages) {
+		kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
+			gva);
+		return KVM_INVALID_PAGE;
+	}
+
+	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+		return KVM_INVALID_ADDR;
+
+	return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+}
+
+/* XXXKYMA: Must be called with interrupts disabled */
+int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
+				    struct kvm_vcpu *vcpu)
+{
+	gfn_t gfn;
+	kvm_pfn_t pfn0, pfn1;
+	unsigned long vaddr = 0;
+	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+	int even;
+	struct kvm *kvm = vcpu->kvm;
+	const int flush_dcache_mask = 0;
+	int ret;
+
+	if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
+		kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
+		kvm_mips_dump_host_tlbs();
+		return -1;
+	}
+
+	gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
+	if (gfn >= kvm->arch.guest_pmap_npages) {
+		kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
+			gfn, badvaddr);
+		kvm_mips_dump_host_tlbs();
+		return -1;
+	}
+	even = !(gfn & 0x1);
+	vaddr = badvaddr & (PAGE_MASK << 1);
+
+	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+		return -1;
+
+	if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
+		return -1;
+
+	if (even) {
+		pfn0 = kvm->arch.guest_pmap[gfn];
+		pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
+	} else {
+		pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
+		pfn1 = kvm->arch.guest_pmap[gfn];
+	}
+
+	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
+		   (1 << 2) | (0x1 << 1);
+	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
+		   (1 << 2) | (0x1 << 1);
+
+	preempt_disable();
+	entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+				      flush_dcache_mask);
+	preempt_enable();
+
+	return ret;
+}
+
+int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
+					 struct kvm_mips_tlb *tlb,
+					 unsigned long *hpa0,
+					 unsigned long *hpa1)
+{
+	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+	struct kvm *kvm = vcpu->kvm;
+	kvm_pfn_t pfn0, pfn1;
+	int ret;
+
+	if ((tlb->tlb_hi & VPN2_MASK) == 0) {
+		pfn0 = 0;
+		pfn1 = 0;
+	} else {
+		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
+					   >> PAGE_SHIFT) < 0)
+			return -1;
+
+		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
+					   >> PAGE_SHIFT) < 0)
+			return -1;
+
+		pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
+					    >> PAGE_SHIFT];
+		pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
+					    >> PAGE_SHIFT];
+	}
+
+	if (hpa0)
+		*hpa0 = pfn0 << PAGE_SHIFT;
+
+	if (hpa1)
+		*hpa1 = pfn1 << PAGE_SHIFT;
+
+	/* Get attributes from the Guest TLB */
+	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
+		   (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
+	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
+		   (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
+
+	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
+		  tlb->tlb_lo0, tlb->tlb_lo1);
+
+	preempt_disable();
+	entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+					       kvm_mips_get_kernel_asid(vcpu) :
+					       kvm_mips_get_user_asid(vcpu));
+	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+				      tlb->tlb_mask);
+	preempt_enable();
+
+	return ret;
+}
+
+void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
+			     struct kvm_vcpu *vcpu)
+{
+	unsigned long asid = asid_cache(cpu);
+
+	asid += cpu_asid_inc();
+	if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
+		if (cpu_has_vtag_icache)
+			flush_icache_all();
+
+		kvm_local_flush_tlb_all();      /* start new asid cycle */
+
+		if (!asid)      /* fix version if needed */
+			asid = asid_first_version(cpu);
+	}
+
+	cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+}
+
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu:	Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+	if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+		hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
+/* Restore ASID once we are scheduled back after preemption */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
+	unsigned long flags;
+	int newasid = 0;
+
+	kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
+
+	/* Allocate new kernel and user ASIDs if needed */
+
+	local_irq_save(flags);
+
+	if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
+						asid_version_mask(cpu)) {
+		kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
+		vcpu->arch.guest_kernel_asid[cpu] =
+		    vcpu->arch.guest_kernel_mm.context.asid[cpu];
+		kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
+		vcpu->arch.guest_user_asid[cpu] =
+		    vcpu->arch.guest_user_mm.context.asid[cpu];
+		newasid++;
+
+		kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+			  cpu_context(cpu, current->mm));
+		kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+			  cpu, vcpu->arch.guest_kernel_asid[cpu]);
+		kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+			  vcpu->arch.guest_user_asid[cpu]);
+	}
+
+	if (vcpu->arch.last_sched_cpu != cpu) {
+		kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+			  vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+		/*
+		 * Migrate the timer interrupt to the current CPU so that it
+		 * always interrupts the guest and synchronously triggers a
+		 * guest timer interrupt.
+		 */
+		kvm_mips_migrate_count(vcpu);
+	}
+
+	if (!newasid) {
+		/*
+		 * If we preempted while the guest was executing, then reload
+		 * the pre-empted ASID
+		 */
+		if (current->flags & PF_VCPU) {
+			write_c0_entryhi(vcpu->arch.
+					 preempt_entryhi & asid_mask);
+			ehb();
+		}
+	} else {
+		/* New ASIDs were allocated for the VM */
+
+		/*
+		 * Were we in guest context? If so then the pre-empted ASID is
+		 * no longer valid, we need to set it to what it should be based
+		 * on the mode of the Guest (Kernel/User)
+		 */
+		if (current->flags & PF_VCPU) {
+			if (KVM_GUEST_KERNEL_MODE(vcpu))
+				write_c0_entryhi(vcpu->arch.
+						 guest_kernel_asid[cpu] &
+						 asid_mask);
+			else
+				write_c0_entryhi(vcpu->arch.
+						 guest_user_asid[cpu] &
+						 asid_mask);
+			ehb();
+		}
+	}
+
+	/* restore guest state to registers */
+	kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
+	local_irq_restore(flags);
+
+}
+
+/* ASID can change if another task is scheduled during preemption */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+	int cpu;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+
+	vcpu->arch.preempt_entryhi = read_c0_entryhi();
+	vcpu->arch.last_sched_cpu = cpu;
+
+	/* save guest state in registers */
+	kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
+	if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+	     asid_version_mask(cpu))) {
+		kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
+			  cpu_context(cpu, current->mm));
+		drop_mmu_context(current->mm, cpu);
+	}
+	write_c0_entryhi(cpu_asid(cpu, current->mm));
+	ehb();
+
+	local_irq_restore(flags);
+}
+
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned long paddr, flags, vpn2, asid;
+	u32 inst;
+	int index;
+
+	if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
+	    KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+		local_irq_save(flags);
+		index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
+		if (index >= 0) {
+			inst = *(opc);
+		} else {
+			vpn2 = (unsigned long) opc & VPN2_MASK;
+			asid = kvm_read_c0_guest_entryhi(cop0) &
+						KVM_ENTRYHI_ASID;
+			index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
+			if (index < 0) {
+				kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
+					__func__, opc, vcpu, read_c0_entryhi());
+				kvm_mips_dump_host_tlbs();
+				local_irq_restore(flags);
+				return KVM_INVALID_INST;
+			}
+			kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+							     &vcpu->arch.
+							     guest_tlb[index],
+							     NULL, NULL);
+			inst = *(opc);
+		}
+		local_irq_restore(flags);
+	} else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
+		paddr =
+		    kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+							  (unsigned long) opc);
+		inst = *(u32 *) CKSEG0ADDR(paddr);
+	} else {
+		kvm_err("%s: illegal address: %p\n", __func__, opc);
+		return KVM_INVALID_INST;
+	}
+
+	return inst;
+}
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index c4e11e13804267..373817c3166b35 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -14,7 +14,7 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kvm_host.h>
 #include <linux/srcu.h>
 
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
 bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
 
-u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
 	int cpu = smp_processor_id();
 
@@ -53,7 +53,7 @@ u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 			cpu_asid_mask(&cpu_data[cpu]);
 }
 
-u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
 {
 	int cpu = smp_processor_id();
 
@@ -146,58 +146,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
 
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
-{
-	int srcu_idx, err = 0;
-	kvm_pfn_t pfn;
-
-	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
-		return 0;
-
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-	pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
-
-	if (kvm_mips_is_error_pfn(pfn)) {
-		kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
-		err = -EFAULT;
-		goto out;
-	}
-
-	kvm->arch.guest_pmap[gfn] = pfn;
-out:
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-	return err;
-}
-
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
-						    unsigned long gva)
-{
-	gfn_t gfn;
-	unsigned long offset = gva & ~PAGE_MASK;
-	struct kvm *kvm = vcpu->kvm;
-
-	if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
-		kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
-			__builtin_return_address(0), gva);
-		return KVM_INVALID_PAGE;
-	}
-
-	gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
-
-	if (gfn >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
-			gva);
-		return KVM_INVALID_PAGE;
-	}
-
-	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-		return KVM_INVALID_ADDR;
-
-	return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa);
-
 /* XXXKYMA: Must be called with interrupts disabled */
 /* set flush_dcache_mask == 0 if no dcache flush required */
 int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
@@ -261,64 +209,7 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
 	local_irq_restore(flags);
 	return 0;
 }
-
-/* XXXKYMA: Must be called with interrupts disabled */
-int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
-				    struct kvm_vcpu *vcpu)
-{
-	gfn_t gfn;
-	kvm_pfn_t pfn0, pfn1;
-	unsigned long vaddr = 0;
-	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-	int even;
-	struct kvm *kvm = vcpu->kvm;
-	const int flush_dcache_mask = 0;
-	int ret;
-
-	if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
-		kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
-		kvm_mips_dump_host_tlbs();
-		return -1;
-	}
-
-	gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
-	if (gfn >= kvm->arch.guest_pmap_npages) {
-		kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
-			gfn, badvaddr);
-		kvm_mips_dump_host_tlbs();
-		return -1;
-	}
-	even = !(gfn & 0x1);
-	vaddr = badvaddr & (PAGE_MASK << 1);
-
-	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
-		return -1;
-
-	if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
-		return -1;
-
-	if (even) {
-		pfn0 = kvm->arch.guest_pmap[gfn];
-		pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
-	} else {
-		pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
-		pfn1 = kvm->arch.guest_pmap[gfn];
-	}
-
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (1 << 2) | (0x1 << 1);
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-		   (1 << 2) | (0x1 << 1);
-
-	preempt_disable();
-	entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
-	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-				      flush_dcache_mask);
-	preempt_enable();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
 
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	struct kvm_vcpu *vcpu)
@@ -363,61 +254,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 }
 EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
 
-int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-					 struct kvm_mips_tlb *tlb,
-					 unsigned long *hpa0,
-					 unsigned long *hpa1)
-{
-	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-	struct kvm *kvm = vcpu->kvm;
-	kvm_pfn_t pfn0, pfn1;
-	int ret;
-
-	if ((tlb->tlb_hi & VPN2_MASK) == 0) {
-		pfn0 = 0;
-		pfn1 = 0;
-	} else {
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-					   >> PAGE_SHIFT) < 0)
-			return -1;
-
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-					   >> PAGE_SHIFT) < 0)
-			return -1;
-
-		pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-					    >> PAGE_SHIFT];
-		pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-					    >> PAGE_SHIFT];
-	}
-
-	if (hpa0)
-		*hpa0 = pfn0 << PAGE_SHIFT;
-
-	if (hpa1)
-		*hpa1 = pfn1 << PAGE_SHIFT;
-
-	/* Get attributes from the Guest TLB */
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
-
-	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
-		  tlb->tlb_lo0, tlb->tlb_lo1);
-
-	preempt_disable();
-	entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
-					       kvm_mips_get_kernel_asid(vcpu) :
-					       kvm_mips_get_user_asid(vcpu));
-	ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
-				      tlb->tlb_mask);
-	preempt_enable();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
-
 int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 {
 	int i;
@@ -574,25 +410,6 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
 }
 EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
 
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
-			     struct kvm_vcpu *vcpu)
-{
-	unsigned long asid = asid_cache(cpu);
-
-	asid += cpu_asid_inc();
-	if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
-		if (cpu_has_vtag_icache)
-			flush_icache_all();
-
-		kvm_local_flush_tlb_all();      /* start new asid cycle */
-
-		if (!asid)      /* fix version if needed */
-			asid = asid_first_version(cpu);
-	}
-
-	cpu_context(cpu, mm) = asid_cache(cpu) = asid;
-}
-
 void kvm_local_flush_tlb_all(void)
 {
 	unsigned long flags;
@@ -621,176 +438,3 @@ void kvm_local_flush_tlb_all(void)
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
-
-/**
- * kvm_mips_migrate_count() - Migrate timer.
- * @vcpu:	Virtual CPU.
- *
- * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
- * if it was running prior to being cancelled.
- *
- * Must be called when the VCPU is migrated to a different CPU to ensure that
- * timer expiry during guest execution interrupts the guest and causes the
- * interrupt to be delivered in a timely manner.
- */
-static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
-{
-	if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
-		hrtimer_restart(&vcpu->arch.comparecount_timer);
-}
-
-/* Restore ASID once we are scheduled back after preemption */
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
-	unsigned long flags;
-	int newasid = 0;
-
-	kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-
-	/* Allocate new kernel and user ASIDs if needed */
-
-	local_irq_save(flags);
-
-	if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
-						asid_version_mask(cpu)) {
-		kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
-		vcpu->arch.guest_kernel_asid[cpu] =
-		    vcpu->arch.guest_kernel_mm.context.asid[cpu];
-		kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
-		vcpu->arch.guest_user_asid[cpu] =
-		    vcpu->arch.guest_user_mm.context.asid[cpu];
-		newasid++;
-
-		kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
-			  cpu_context(cpu, current->mm));
-		kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
-			  cpu, vcpu->arch.guest_kernel_asid[cpu]);
-		kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
-			  vcpu->arch.guest_user_asid[cpu]);
-	}
-
-	if (vcpu->arch.last_sched_cpu != cpu) {
-		kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
-			  vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
-		/*
-		 * Migrate the timer interrupt to the current CPU so that it
-		 * always interrupts the guest and synchronously triggers a
-		 * guest timer interrupt.
-		 */
-		kvm_mips_migrate_count(vcpu);
-	}
-
-	if (!newasid) {
-		/*
-		 * If we preempted while the guest was executing, then reload
-		 * the pre-empted ASID
-		 */
-		if (current->flags & PF_VCPU) {
-			write_c0_entryhi(vcpu->arch.
-					 preempt_entryhi & asid_mask);
-			ehb();
-		}
-	} else {
-		/* New ASIDs were allocated for the VM */
-
-		/*
-		 * Were we in guest context? If so then the pre-empted ASID is
-		 * no longer valid, we need to set it to what it should be based
-		 * on the mode of the Guest (Kernel/User)
-		 */
-		if (current->flags & PF_VCPU) {
-			if (KVM_GUEST_KERNEL_MODE(vcpu))
-				write_c0_entryhi(vcpu->arch.
-						 guest_kernel_asid[cpu] &
-						 asid_mask);
-			else
-				write_c0_entryhi(vcpu->arch.
-						 guest_user_asid[cpu] &
-						 asid_mask);
-			ehb();
-		}
-	}
-
-	/* restore guest state to registers */
-	kvm_mips_callbacks->vcpu_set_regs(vcpu);
-
-	local_irq_restore(flags);
-
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
-
-/* ASID can change if another task is scheduled during preemption */
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	unsigned long flags;
-	int cpu;
-
-	local_irq_save(flags);
-
-	cpu = smp_processor_id();
-
-	vcpu->arch.preempt_entryhi = read_c0_entryhi();
-	vcpu->arch.last_sched_cpu = cpu;
-
-	/* save guest state in registers */
-	kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
-	if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
-	     asid_version_mask(cpu))) {
-		kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
-			  cpu_context(cpu, current->mm));
-		drop_mmu_context(current->mm, cpu);
-	}
-	write_c0_entryhi(cpu_asid(cpu, current->mm));
-	ehb();
-
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
-
-u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
-{
-	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	unsigned long paddr, flags, vpn2, asid;
-	u32 inst;
-	int index;
-
-	if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
-	    KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-		local_irq_save(flags);
-		index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
-		if (index >= 0) {
-			inst = *(opc);
-		} else {
-			vpn2 = (unsigned long) opc & VPN2_MASK;
-			asid = kvm_read_c0_guest_entryhi(cop0) &
-						KVM_ENTRYHI_ASID;
-			index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
-			if (index < 0) {
-				kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
-					__func__, opc, vcpu, read_c0_entryhi());
-				kvm_mips_dump_host_tlbs();
-				local_irq_restore(flags);
-				return KVM_INVALID_INST;
-			}
-			kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
-							     &vcpu->arch.
-							     guest_tlb[index],
-							     NULL, NULL);
-			inst = *(opc);
-		}
-		local_irq_restore(flags);
-	} else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-		paddr =
-		    kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
-							  (unsigned long) opc);
-		inst = *(u32 *) CKSEG0ADDR(paddr);
-	} else {
-		kvm_err("%s: illegal address: %p\n", __func__, opc);
-		return KVM_INVALID_INST;
-	}
-
-	return inst;
-}
-EXPORT_SYMBOL_GPL(kvm_get_inst);

From 9befad23ed3e2e178741cb84ac09c0ff45610537 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:11 +0100
Subject: [PATCH 064/302] MIPS: KVM: Don't indirect KVM functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several KVM module functions are indirected so that they can be accessed
from tlb.c which is statically built into the kernel. This is no longer
necessary as the relevant bits of code have moved into mmu.c which is
part of the KVM module, so drop the indirections.

Note: is_error_pfn() is defined inline in kvm_host.h, so didn't actually
require the KVM module to be loaded for it to work anyway.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  3 ---
 arch/mips/kvm/mips.c             | 18 +-----------------
 arch/mips/kvm/mmu.c              |  4 ++--
 arch/mips/kvm/tlb.c              | 10 ----------
 4 files changed, 3 insertions(+), 32 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f64be7987a3256..c8f9671c2779a2 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -93,9 +93,6 @@
 #define KVM_INVALID_ADDR		0xdeadbeef
 
 extern atomic_t kvm_mips_instance;
-extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a2b1b9205b943d..c1ab6110ca1dcd 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -147,7 +147,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
 	/* Put the pages we reserved for the guest pmap */
 	for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
 		if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
-			kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+			kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
 	}
 	kfree(kvm->arch.guest_pmap);
 
@@ -1645,18 +1645,6 @@ static int __init kvm_mips_init(void)
 
 	register_die_notifier(&kvm_mips_csr_die_notifier);
 
-	/*
-	 * On MIPS, kernel modules are executed from "mapped space", which
-	 * requires TLBs. The TLB handling code is statically linked with
-	 * the rest of the kernel (tlb.c) to avoid the possibility of
-	 * double faulting. The issue is that the TLB code references
-	 * routines that are part of the the KVM module, which are only
-	 * available once the module is loaded.
-	 */
-	kvm_mips_gfn_to_pfn = gfn_to_pfn;
-	kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
-	kvm_mips_is_error_pfn = is_error_pfn;
-
 	return 0;
 }
 
@@ -1664,10 +1652,6 @@ static void __exit kvm_mips_exit(void)
 {
 	kvm_exit();
 
-	kvm_mips_gfn_to_pfn = NULL;
-	kvm_mips_release_pfn_clean = NULL;
-	kvm_mips_is_error_pfn = NULL;
-
 	unregister_die_notifier(&kvm_mips_csr_die_notifier);
 }
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index d18cadfd6e0159..d5ada83ec55cf4 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -37,9 +37,9 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
 		return 0;
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
+	pfn = gfn_to_pfn(kvm, gfn);
 
-	if (kvm_mips_is_error_pfn(pfn)) {
+	if (is_error_pfn(pfn)) {
 		kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
 		err = -EFAULT;
 		goto out;
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 373817c3166b35..37d77ad8431e3c 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -35,16 +35,6 @@
 atomic_t kvm_mips_instance;
 EXPORT_SYMBOL_GPL(kvm_mips_instance);
 
-/* These function pointers are initialized once the KVM module is loaded */
-kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn);
-
-void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
-
-bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
-
 static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
 	int cpu = smp_processor_id();

From 021df206354cf1e1d341b66dee19ac250c9dc37d Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:12 +0100
Subject: [PATCH 065/302] MIPS: KVM: Simplify even/odd TLB handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When handling TLB faults in the guest KSeg0 region, a pair of physical
addresses are read from the guest physical address map. However that
process is rather convoluted with an if/then/else statement. Simplify it
to just clear the lowest bit for the even entry and set the lowest bit
for the odd entry.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mmu.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index d5ada83ec55cf4..9924c1d253a7c6 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -87,7 +87,6 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 	kvm_pfn_t pfn0, pfn1;
 	unsigned long vaddr = 0;
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
-	int even;
 	struct kvm *kvm = vcpu->kvm;
 	const int flush_dcache_mask = 0;
 	int ret;
@@ -105,7 +104,6 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 		kvm_mips_dump_host_tlbs();
 		return -1;
 	}
-	even = !(gfn & 0x1);
 	vaddr = badvaddr & (PAGE_MASK << 1);
 
 	if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
@@ -114,13 +112,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 	if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
 		return -1;
 
-	if (even) {
-		pfn0 = kvm->arch.guest_pmap[gfn];
-		pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
-	} else {
-		pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
-		pfn1 = kvm->arch.guest_pmap[gfn];
-	}
+	pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
+	pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
 
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
 		   (1 << 2) | (0x1 << 1);

From 26ee17ff71d3def831bfa4f6851ed1ba789e24f6 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:13 +0100
Subject: [PATCH 066/302] MIPS: KVM: Drop unused hpa0/hpa1 args from function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function kvm_mips_handle_mapped_seg_tlb_fault() has two completely
unused pointer arguments, hpa0 and hpa1, for which all users always pass
NULL.

Drop these two arguments and update the callers.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  4 +---
 arch/mips/kvm/emulate.c          |  7 ++-----
 arch/mips/kvm/mmu.c              | 13 ++-----------
 3 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index c8f9671c2779a2..f68293b4a598f8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -630,9 +630,7 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 					      struct kvm_vcpu *vcpu);
 
 extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-						struct kvm_mips_tlb *tlb,
-						unsigned long *hpa0,
-						unsigned long *hpa1);
+						struct kvm_mips_tlb *tlb);
 
 extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 						     u32 *opc,
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 3baab5ec3d3b00..fb77fb46977609 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1633,9 +1633,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
 				 * We fault an entry from the guest tlb to the
 				 * shadow host TLB
 				 */
-				kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
-								     NULL,
-								     NULL);
+				kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
 			}
 		}
 	} else {
@@ -2599,8 +2597,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 			 * OK we have a Guest TLB entry, now inject it into the
 			 * shadow host TLB
 			 */
-			kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
-							     NULL);
+			kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
 		}
 	}
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 9924c1d253a7c6..4d42b47b500b8e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -130,9 +130,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 }
 
 int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
-					 struct kvm_mips_tlb *tlb,
-					 unsigned long *hpa0,
-					 unsigned long *hpa1)
+					 struct kvm_mips_tlb *tlb)
 {
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	struct kvm *kvm = vcpu->kvm;
@@ -157,12 +155,6 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 					    >> PAGE_SHIFT];
 	}
 
-	if (hpa0)
-		*hpa0 = pfn0 << PAGE_SHIFT;
-
-	if (hpa1)
-		*hpa1 = pfn1 << PAGE_SHIFT;
-
 	/* Get attributes from the Guest TLB */
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
 		   (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
@@ -354,8 +346,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 			}
 			kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
 							     &vcpu->arch.
-							     guest_tlb[index],
-							     NULL, NULL);
+							     guest_tlb[index]);
 			inst = *(opc);
 		}
 		local_irq_restore(flags);

From 878edf014e29de38c49153aba20273fbc9ae31af Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:14 +0100
Subject: [PATCH 067/302] MIPS: KVM: Restore host EBase from ebase variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The host kernel's exception vector base address is currently saved in
the VCPU structure at creation time, and restored on a guest exit.
However it doesn't change and can already be easily accessed from the
'ebase' variable (arch/mips/kernel/traps.c), so drop the host_ebase
member of kvm_vcpu_arch, export the 'ebase' variable to modules and load
from there instead.

This does result in a single extra instruction (lui) on the guest exit
path, but simplifies the code a bit and removes the redundant storage of
the host exception base address.

Credit for the idea goes to Cavium's VZ KVM implementation.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 2 +-
 arch/mips/kernel/asm-offsets.c   | 1 -
 arch/mips/kernel/traps.c         | 1 +
 arch/mips/kvm/locore.S           | 2 +-
 arch/mips/kvm/mips.c             | 3 ---
 5 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f68293b4a598f8..24a8e557db887f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -334,7 +334,7 @@ struct kvm_mips_tlb {
 
 #define KVM_MIPS_GUEST_TLB_SIZE	64
 struct kvm_vcpu_arch {
-	void *host_ebase, *guest_ebase;
+	void *guest_ebase;
 	int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	unsigned long host_stack;
 	unsigned long host_gp;
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 420808899c7004..a1263d188a5a8c 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -355,7 +355,6 @@ void output_kvm_defines(void)
 	OFFSET(VCPU_RUN, kvm_vcpu, run);
 	OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
 
-	OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
 	OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
 
 	OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 4a1712b5abdff6..66e5820bfdae43 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs)
 #define VECTORSPACING 0x100	/* for EI/VI mode */
 
 unsigned long ebase;
+EXPORT_SYMBOL_GPL(ebase);
 unsigned long exception_handlers[32];
 unsigned long vi_handlers[64];
 
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index 43c8ef847efaaf..f87bec546366f2 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -319,7 +319,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
 	mtc0	k0, CP0_STATUS
 	ehb
 
-	LONG_L	k0, VCPU_HOST_EBASE(k1)
+	LONG_L	k0, ebase
 	mtc0	k0,CP0_EBASE
 
 	/*
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index c1ab6110ca1dcd..6e753761b5d6e5 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -273,9 +273,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	else
 		size = 0x4000;
 
-	/* Save Linux EBASE */
-	vcpu->arch.host_ebase = (void *)read_c0_ebase();
-
 	gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
 
 	if (!gebase) {

From 138f7ad916760a7c263678ce06545a0cfc98bf97 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:15 +0100
Subject: [PATCH 068/302] MIPS: KVM: Clean up TLB management hazards
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM's host TLB handling routines were using tlbw hazard barrier macros
around tlb_read(). Now that hazard barrier macros exist for tlbr, update
this case to use them.

Also fix up various other unnecessary or misplaced hazard barriers in this
code.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/tlb.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 37d77ad8431e3c..d3000680df1ff0 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -195,7 +195,6 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
 	/* Restore old ASID */
 	write_c0_entryhi(old_entryhi);
 	mtc0_tlbw_hazard();
-	tlbw_use_hazard();
 	local_irq_restore(flags);
 	return 0;
 }
@@ -219,15 +218,11 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	old_entryhi = read_c0_entryhi();
 	vaddr = badvaddr & (PAGE_MASK << 1);
 	write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
-	mtc0_tlbw_hazard();
 	write_c0_entrylo0(entrylo0);
-	mtc0_tlbw_hazard();
 	write_c0_entrylo1(entrylo1);
-	mtc0_tlbw_hazard();
 	write_c0_index(kvm_mips_get_commpage_asid(vcpu));
 	mtc0_tlbw_hazard();
 	tlb_write_indexed();
-	mtc0_tlbw_hazard();
 	tlbw_use_hazard();
 
 	kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
@@ -237,7 +232,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	/* Restore old ASID */
 	write_c0_entryhi(old_entryhi);
 	mtc0_tlbw_hazard();
-	tlbw_use_hazard();
 	local_irq_restore(flags);
 
 	return 0;
@@ -291,7 +285,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
 	/* Restore old ASID */
 	write_c0_entryhi(old_entryhi);
 	mtc0_tlbw_hazard();
-	tlbw_use_hazard();
 
 	local_irq_restore(flags);
 
@@ -322,21 +315,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
 
 	if (idx > 0) {
 		write_c0_entryhi(UNIQUE_ENTRYHI(idx));
-		mtc0_tlbw_hazard();
-
 		write_c0_entrylo0(0);
-		mtc0_tlbw_hazard();
-
 		write_c0_entrylo1(0);
 		mtc0_tlbw_hazard();
 
 		tlb_write_indexed();
-		mtc0_tlbw_hazard();
+		tlbw_use_hazard();
 	}
 
 	write_c0_entryhi(old_entryhi);
 	mtc0_tlbw_hazard();
-	tlbw_use_hazard();
 
 	local_irq_restore(flags);
 
@@ -364,11 +352,11 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
 	/* Blast 'em all away. */
 	for (entry = 0; entry < maxentry; entry++) {
 		write_c0_index(entry);
-		mtc0_tlbw_hazard();
 
 		if (skip_kseg0) {
+			mtc0_tlbr_hazard();
 			tlb_read();
-			tlbw_use_hazard();
+			tlb_read_hazard();
 
 			entryhi = read_c0_entryhi();
 
@@ -379,22 +367,17 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
 
 		/* Make sure all entries differ. */
 		write_c0_entryhi(UNIQUE_ENTRYHI(entry));
-		mtc0_tlbw_hazard();
 		write_c0_entrylo0(0);
-		mtc0_tlbw_hazard();
 		write_c0_entrylo1(0);
 		mtc0_tlbw_hazard();
 
 		tlb_write_indexed();
-		mtc0_tlbw_hazard();
+		tlbw_use_hazard();
 	}
 
-	tlbw_use_hazard();
-
 	write_c0_entryhi(old_entryhi);
 	write_c0_pagemask(old_pagemask);
 	mtc0_tlbw_hazard();
-	tlbw_use_hazard();
 
 	local_irq_restore(flags);
 }
@@ -419,9 +402,9 @@ void kvm_local_flush_tlb_all(void)
 		write_c0_index(entry);
 		mtc0_tlbw_hazard();
 		tlb_write_indexed();
+		tlbw_use_hazard();
 		entry++;
 	}
-	tlbw_use_hazard();
 	write_c0_entryhi(old_ctx);
 	mtc0_tlbw_hazard();
 

From e922a4cb71e745e53e64446d792c4603df43643a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:16 +0100
Subject: [PATCH 069/302] MIPS: KVM: Use dump_tlb_all() for
 kvm_mips_dump_host_tlbs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM implements its own routine for dumping the host TLB entries, but we
already have dump_tlb_all() which does something very similar (although
it only prints out TLB entries which match the current ASID or are
global).

Make KVM use dump_tlb_all() along with dump_tlb_regs() to avoid the
duplication and inevitable bitrot, allowing TLB dumping enhancements
(e.g. for VZ and GuestIDs) to be made in a single place.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/tlb.c | 42 ++++--------------------------------------
 1 file changed, 4 insertions(+), 38 deletions(-)

diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index d3000680df1ff0..c0b8e3fc895e91 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -24,6 +24,7 @@
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
+#include <asm/tlbdebug.h>
 
 #undef CONFIG_MIPS_MT
 #include <asm/r4kcache.h>
@@ -60,50 +61,15 @@ inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
 
 void kvm_mips_dump_host_tlbs(void)
 {
-	unsigned long old_entryhi;
-	unsigned long old_pagemask;
-	struct kvm_mips_tlb tlb;
 	unsigned long flags;
-	int i;
 
 	local_irq_save(flags);
 
-	old_entryhi = read_c0_entryhi();
-	old_pagemask = read_c0_pagemask();
-
 	kvm_info("HOST TLBs:\n");
-	kvm_info("ASID: %#lx\n", read_c0_entryhi() &
-		 cpu_asid_mask(&current_cpu_data));
+	dump_tlb_regs();
+	pr_info("\n");
+	dump_tlb_all();
 
-	for (i = 0; i < current_cpu_data.tlbsize; i++) {
-		write_c0_index(i);
-		mtc0_tlbw_hazard();
-
-		tlb_read();
-		tlbw_use_hazard();
-
-		tlb.tlb_hi = read_c0_entryhi();
-		tlb.tlb_lo0 = read_c0_entrylo0();
-		tlb.tlb_lo1 = read_c0_entrylo1();
-		tlb.tlb_mask = read_c0_pagemask();
-
-		kvm_info("TLB%c%3d Hi 0x%08lx ",
-			 (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
-			 i, tlb.tlb_hi);
-		kvm_info("Lo0=0x%09llx %c%c attr %lx ",
-			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-			 (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo0 >> 3) & 7);
-		kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
-			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-			 (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
-	}
-	write_c0_entryhi(old_entryhi);
-	write_c0_pagemask(old_pagemask);
-	mtc0_tlbw_hazard();
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs);

From 9fbfb06a4065772571aa58d2583868268fc8be53 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:17 +0100
Subject: [PATCH 070/302] MIPS: KVM: Arrayify struct kvm_mips_tlb::tlb_lo*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The values of the EntryLo0 and EntryLo1 registers for a TLB entry are
stored in separate members of struct kvm_mips_tlb called tlb_lo0 and
tlb_lo1 respectively. To allow future code which needs to manipulate
arbitrary EntryLo data in the TLB entry to be simpler and less
conditional, replace these members with an array of two elements.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 11 +++++------
 arch/mips/kvm/emulate.c          | 10 +++++-----
 arch/mips/kvm/mmu.c              | 20 +++++++++++---------
 arch/mips/kvm/tlb.c              | 21 +++++++++++----------
 4 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 24a8e557db887f..2d15da111ba8e2 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -310,13 +310,13 @@ enum emulation_result {
 
 #define VPN2_MASK		0xffffe000
 #define KVM_ENTRYHI_ASID	MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x)	(((x).tlb_lo0 & MIPS3_PG_G) &&		\
-				 ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x)	(((x).tlb_lo[0] & MIPS3_PG_G) &&	\
+				 ((x).tlb_lo[1] & MIPS3_PG_G))
 #define TLB_VPN2(x)		((x).tlb_hi & VPN2_MASK)
 #define TLB_ASID(x)		((x).tlb_hi & KVM_ENTRYHI_ASID)
 #define TLB_IS_VALID(x, va)	(((va) & (1 << PAGE_SHIFT))		\
-				 ? ((x).tlb_lo1 & MIPS3_PG_V)		\
-				 : ((x).tlb_lo0 & MIPS3_PG_V))
+				 ? ((x).tlb_lo[1] & MIPS3_PG_V)		\
+				 : ((x).tlb_lo[0] & MIPS3_PG_V))
 #define TLB_HI_VPN2_HIT(x, y)	((TLB_VPN2(x) & ~(x).tlb_mask) ==	\
 				 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)	(TLB_IS_GLOBAL(x) ||			\
@@ -325,8 +325,7 @@ enum emulation_result {
 struct kvm_mips_tlb {
 	long tlb_mask;
 	long tlb_hi;
-	long tlb_lo0;
-	long tlb_lo1;
+	long tlb_lo[2];
 };
 
 #define KVM_MIPS_FPU_FPU	0x1
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index fb77fb46977609..5b89c08034052a 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -833,8 +833,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
 
 	tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
 	tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-	tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-	tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+	tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+	tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
 	kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
 		  pc, index, kvm_read_c0_guest_entryhi(cop0),
@@ -866,8 +866,8 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
 
 	tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
 	tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
-	tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
-	tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+	tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+	tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
 
 	kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
 		  pc, index, kvm_read_c0_guest_entryhi(cop0),
@@ -2592,7 +2592,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 			}
 		} else {
 			kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
-				  tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
+				  tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]);
 			/*
 			 * OK we have a Guest TLB entry, now inject it into the
 			 * shadow host TLB
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 4d42b47b500b8e..14996aa5e7c4f9 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -141,28 +141,30 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 		pfn0 = 0;
 		pfn1 = 0;
 	} else {
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
+		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
 					   >> PAGE_SHIFT) < 0)
 			return -1;
 
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
+		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
 					   >> PAGE_SHIFT) < 0)
 			return -1;
 
-		pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
-					    >> PAGE_SHIFT];
-		pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
-					    >> PAGE_SHIFT];
+		pfn0 = kvm->arch.guest_pmap[
+			mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
+		pfn1 = kvm->arch.guest_pmap[
+			mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
 	}
 
 	/* Get attributes from the Guest TLB */
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
+		   (tlb->tlb_lo[0] & MIPS3_PG_D) |
+		   (tlb->tlb_lo[0] & MIPS3_PG_V);
 	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
+		   (tlb->tlb_lo[1] & MIPS3_PG_D) |
+		   (tlb->tlb_lo[1] & MIPS3_PG_V);
 
 	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
-		  tlb->tlb_lo0, tlb->tlb_lo1);
+		  tlb->tlb_lo[0], tlb->tlb_lo[1]);
 
 	preempt_disable();
 	entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index c0b8e3fc895e91..4825d0dbb65eee 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -86,18 +86,19 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
 	for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
 		tlb = vcpu->arch.guest_tlb[i];
 		kvm_info("TLB%c%3d Hi 0x%08lx ",
-			 (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+			 (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & MIPS3_PG_V
+							? ' ' : '*',
 			 i, tlb.tlb_hi);
 		kvm_info("Lo0=0x%09llx %c%c attr %lx ",
-			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
-			 (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo0 >> 3) & 7);
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
+			 (tlb.tlb_lo[0] & MIPS3_PG_D) ? 'D' : ' ',
+			 (tlb.tlb_lo[0] & MIPS3_PG_G) ? 'G' : ' ',
+			 (tlb.tlb_lo[0] >> 3) & 7);
 		kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
-			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
-			 (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
+			 (tlb.tlb_lo[1] & MIPS3_PG_D) ? 'D' : ' ',
+			 (tlb.tlb_lo[1] & MIPS3_PG_G) ? 'G' : ' ',
+			 (tlb.tlb_lo[1] >> 3) & 7, tlb.tlb_mask);
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
@@ -219,7 +220,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
 	}
 
 	kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
-		  __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
+		  __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]);
 
 	return index;
 }

From 19d194c62b25cafaf64a5fe74305b3e9b84d63d8 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:18 +0100
Subject: [PATCH 071/302] MIPS: KVM: Simplify TLB_* macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simplify some of the TLB_* macros by making use of the arrayification of
tlb_lo. Basically we index the array by the bit of the virtual address
which determines whether the even or odd entry is used, instead of
having a conditional.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2d15da111ba8e2..83a3212b956d5b 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -310,13 +310,11 @@ enum emulation_result {
 
 #define VPN2_MASK		0xffffe000
 #define KVM_ENTRYHI_ASID	MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x)	(((x).tlb_lo[0] & MIPS3_PG_G) &&	\
-				 ((x).tlb_lo[1] & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x)	((x).tlb_lo[0] & (x).tlb_lo[1] & MIPS3_PG_G)
 #define TLB_VPN2(x)		((x).tlb_hi & VPN2_MASK)
 #define TLB_ASID(x)		((x).tlb_hi & KVM_ENTRYHI_ASID)
-#define TLB_IS_VALID(x, va)	(((va) & (1 << PAGE_SHIFT))		\
-				 ? ((x).tlb_lo[1] & MIPS3_PG_V)		\
-				 : ((x).tlb_lo[0] & MIPS3_PG_V))
+#define TLB_LO_IDX(x, va)	(((va) >> PAGE_SHIFT) & 1)
+#define TLB_IS_VALID(x, va)	((x).tlb_lo[TLB_LO_IDX(x, va)] & MIPS3_PG_V)
 #define TLB_HI_VPN2_HIT(x, y)	((TLB_VPN2(x) & ~(x).tlb_mask) ==	\
 				 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)	(TLB_IS_GLOBAL(x) ||			\

From e6207bbea16c60942cdc1492af4feed5aed77389 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:19 +0100
Subject: [PATCH 072/302] MIPS: KVM: Use MIPS_ENTRYLO_* defs from mipsregs.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert KVM to use the MIPS_ENTRYLO_* definitions from <asm/mipsregs.h>
rather than custom definitions in kvm_host.h.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 11 ++++-------
 arch/mips/kvm/mmu.c              | 22 ++++++++++++----------
 arch/mips/kvm/tlb.c              | 23 ++++++++++++-----------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 83a3212b956d5b..d0432b5f23434c 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -19,6 +19,8 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 
+#include <asm/mipsregs.h>
+
 /* MIPS KVM register ids */
 #define MIPS_CP0_32(_R, _S)					\
 	(KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
@@ -295,11 +297,6 @@ enum emulation_result {
 	EMULATE_PRIV_FAIL,
 };
 
-#define MIPS3_PG_G	0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V	0x00000002 /* Valid */
-#define MIPS3_PG_NV	0x00000000
-#define MIPS3_PG_D	0x00000004 /* Dirty */
-
 #define mips3_paddr_to_tlbpfn(x) \
 	(((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
 #define mips3_tlbpfn_to_paddr(x) \
@@ -310,11 +307,11 @@ enum emulation_result {
 
 #define VPN2_MASK		0xffffe000
 #define KVM_ENTRYHI_ASID	MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x)	((x).tlb_lo[0] & (x).tlb_lo[1] & MIPS3_PG_G)
+#define TLB_IS_GLOBAL(x)	((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G)
 #define TLB_VPN2(x)		((x).tlb_hi & VPN2_MASK)
 #define TLB_ASID(x)		((x).tlb_hi & KVM_ENTRYHI_ASID)
 #define TLB_LO_IDX(x, va)	(((va) >> PAGE_SHIFT) & 1)
-#define TLB_IS_VALID(x, va)	((x).tlb_lo[TLB_LO_IDX(x, va)] & MIPS3_PG_V)
+#define TLB_IS_VALID(x, va)	((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
 #define TLB_HI_VPN2_HIT(x, y)	((TLB_VPN2(x) & ~(x).tlb_mask) ==	\
 				 ((y) & VPN2_MASK & ~(x).tlb_mask))
 #define TLB_HI_ASID_HIT(x, y)	(TLB_IS_GLOBAL(x) ||			\
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 14996aa5e7c4f9..ad3125fa9c614d 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -115,10 +115,10 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 	pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
 	pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
 
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (1 << 2) | (0x1 << 1);
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-		   (1 << 2) | (0x1 << 1);
+	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
+	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
 
 	preempt_disable();
 	entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
@@ -156,12 +156,14 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 	}
 
 	/* Get attributes from the Guest TLB */
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo[0] & MIPS3_PG_D) |
-		   (tlb->tlb_lo[0] & MIPS3_PG_V);
-	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
-		   (tlb->tlb_lo[1] & MIPS3_PG_D) |
-		   (tlb->tlb_lo[1] & MIPS3_PG_V);
+	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+		   (0x3 << ENTRYLO_C_SHIFT) |
+		   (tlb->tlb_lo[0] & ENTRYLO_D) |
+		   (tlb->tlb_lo[0] & ENTRYLO_V);
+	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+		   (0x3 << ENTRYLO_C_SHIFT) |
+		   (tlb->tlb_lo[1] & ENTRYLO_D) |
+		   (tlb->tlb_lo[1] & ENTRYLO_V);
 
 	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
 		  tlb->tlb_lo[0], tlb->tlb_lo[1]);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 4825d0dbb65eee..8012e686d4ae51 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -86,19 +86,20 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
 	for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
 		tlb = vcpu->arch.guest_tlb[i];
 		kvm_info("TLB%c%3d Hi 0x%08lx ",
-			 (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & MIPS3_PG_V
+			 (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V
 							? ' ' : '*',
 			 i, tlb.tlb_hi);
 		kvm_info("Lo0=0x%09llx %c%c attr %lx ",
 			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
-			 (tlb.tlb_lo[0] & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo[0] & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo[0] >> 3) & 7);
+			 (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ',
+			 (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ',
+			 (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT);
 		kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
 			 (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
-			 (tlb.tlb_lo[1] & MIPS3_PG_D) ? 'D' : ' ',
-			 (tlb.tlb_lo[1] & MIPS3_PG_G) ? 'G' : ' ',
-			 (tlb.tlb_lo[1] >> 3) & 7, tlb.tlb_mask);
+			 (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ',
+			 (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ',
+			 (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT,
+			 tlb.tlb_mask);
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
@@ -146,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
 
 	/* Flush D-cache */
 	if (flush_dcache_mask) {
-		if (entrylo0 & MIPS3_PG_V) {
+		if (entrylo0 & ENTRYLO_V) {
 			++vcpu->stat.flush_dcache_exits;
 			flush_data_cache_page((entryhi & VPN2_MASK) &
 					      ~flush_dcache_mask);
 		}
-		if (entrylo1 & MIPS3_PG_V) {
+		if (entrylo1 & ENTRYLO_V) {
 			++vcpu->stat.flush_dcache_exits;
 			flush_data_cache_page(((entryhi & VPN2_MASK) &
 					       ~flush_dcache_mask) |
@@ -176,8 +177,8 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 
 	pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
 	pfn1 = 0;
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
-		   (1 << 2) | (0x1 << 1);
+	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
 	entrylo1 = 0;
 
 	local_irq_save(flags);

From 3b08aec549a0314b8c3788bdc2a21096d53225e1 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:20 +0100
Subject: [PATCH 073/302] MIPS: KVM: Combine handle_tlb_ld/st_miss
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The handle_tlb_ld/st_miss handlers are logically equivalent and
textually almost identical, so combine their implementations into a
single kvm_trap_emul_handle_tlb_miss().

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/trap_emul.c | 71 +++++++++++----------------------------
 1 file changed, 19 insertions(+), 52 deletions(-)

diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index ecf0068bc95eb0..09b97fa9dabb7d 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -128,7 +128,7 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
 {
 	struct kvm_run *run = vcpu->run;
 	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
@@ -145,55 +145,8 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
 		}
 	} else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
 		   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-		kvm_debug("USER ADDR TLB LD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
-			  cause, opc, badvaddr);
-		er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
-		if (er == EMULATE_DONE)
-			ret = RESUME_GUEST;
-		else {
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		}
-	} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
-		/*
-		 * All KSEG0 faults are handled by KVM, as the guest kernel does
-		 * not expect to ever get them
-		 */
-		if (kvm_mips_handle_kseg0_tlb_fault
-		    (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		}
-	} else {
-		kvm_err("Illegal TLB LD fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
-			cause, opc, badvaddr);
-		kvm_mips_dump_host_tlbs();
-		kvm_arch_vcpu_dump_regs(vcpu);
-		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		ret = RESUME_HOST;
-	}
-	return ret;
-}
-
-static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
-{
-	struct kvm_run *run = vcpu->run;
-	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
-	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
-	u32 cause = vcpu->arch.host_cp0_cause;
-	enum emulation_result er = EMULATE_DONE;
-	int ret = RESUME_GUEST;
-
-	if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR)
-	    && KVM_GUEST_KERNEL_MODE(vcpu)) {
-		if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) {
-			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			ret = RESUME_HOST;
-		}
-	} else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
-		   || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
-		kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
-			  vcpu->arch.pc, badvaddr);
+		kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
+			  store ? "ST" : "LD", cause, opc, badvaddr);
 
 		/*
 		 * User Address (UA) fault, this could happen if
@@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 			ret = RESUME_HOST;
 		}
 	} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+		/*
+		 * All KSEG0 faults are handled by KVM, as the guest kernel does
+		 * not expect to ever get them
+		 */
 		if (kvm_mips_handle_kseg0_tlb_fault
 		    (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
 			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 			ret = RESUME_HOST;
 		}
 	} else {
-		kvm_err("Illegal TLB ST fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
-			cause, opc, badvaddr);
+		kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
+			store ? "ST" : "LD", cause, opc, badvaddr);
 		kvm_mips_dump_host_tlbs();
 		kvm_arch_vcpu_dump_regs(vcpu);
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -229,6 +186,16 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+	return kvm_trap_emul_handle_tlb_miss(vcpu, true);
+}
+
+static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+	return kvm_trap_emul_handle_tlb_miss(vcpu, false);
+}
+
 static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;

From 35fec26242bd3ff5a770789185852d27b44ffaec Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 9 Jun 2016 14:19:21 +0100
Subject: [PATCH 074/302] MIPS: KVM: Use va in kvm_get_inst()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Like other functions, make use of a local unsigned long va, for the
virtual address of the PC. This reduces the amount of verbose casting of
the opc pointer to an unsigned long.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mmu.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index ad3125fa9c614d..208f70409ccb44 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -327,17 +327,18 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	unsigned long paddr, flags, vpn2, asid;
+	unsigned long va = (unsigned long)opc;
 	u32 inst;
 	int index;
 
-	if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
-	    KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+	if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
+	    KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
 		local_irq_save(flags);
-		index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
+		index = kvm_mips_host_tlb_lookup(vcpu, va);
 		if (index >= 0) {
 			inst = *(opc);
 		} else {
-			vpn2 = (unsigned long) opc & VPN2_MASK;
+			vpn2 = va & VPN2_MASK;
 			asid = kvm_read_c0_guest_entryhi(cop0) &
 						KVM_ENTRYHI_ASID;
 			index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
@@ -354,10 +355,8 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 			inst = *(opc);
 		}
 		local_irq_restore(flags);
-	} else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-		paddr =
-		    kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
-							  (unsigned long) opc);
+	} else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
+		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
 		inst = *(u32 *) CKSEG0ADDR(paddr);
 	} else {
 		kvm_err("%s: illegal address: %p\n", __func__, opc);

From f943176a7205a064da05f81fc94dccc4c7379010 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:10 +0100
Subject: [PATCH 075/302] MIPS: KVM: Generalise fpu_inuse for other state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename fpu_inuse and the related definitions to aux_inuse so it can be
used for lazy context management of other auxiliary processor state too,
such as VZ guest timer, watchpoints and performance counters.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  8 +++----
 arch/mips/kvm/emulate.c          |  8 +++----
 arch/mips/kvm/mips.c             | 38 ++++++++++++++++----------------
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index d0432b5f23434c..e6273850bab69d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -323,8 +323,8 @@ struct kvm_mips_tlb {
 	long tlb_lo[2];
 };
 
-#define KVM_MIPS_FPU_FPU	0x1
-#define KVM_MIPS_FPU_MSA	0x2
+#define KVM_MIPS_AUX_FPU	0x1
+#define KVM_MIPS_AUX_MSA	0x2
 
 #define KVM_MIPS_GUEST_TLB_SIZE	64
 struct kvm_vcpu_arch {
@@ -346,8 +346,8 @@ struct kvm_vcpu_arch {
 
 	/* FPU State */
 	struct mips_fpu_struct fpu;
-	/* Which FPU state is loaded (KVM_MIPS_FPU_*) */
-	unsigned int fpu_inuse;
+	/* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */
+	unsigned int aux_inuse;
 
 	/* COP0 State */
 	struct mips_coproc *cop0;
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 5b89c08034052a..8647bd97b934ad 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1154,7 +1154,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				 * it first.
 				 */
 				if (change & ST0_CU1 && !(val & ST0_FR) &&
-				    vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+				    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
 					kvm_lose_fpu(vcpu);
 
 				/*
@@ -1165,7 +1165,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				 * the near future.
 				 */
 				if (change & ST0_CU1 &&
-				    vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+				    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
 					change_c0_status(ST0_CU1, val);
 
 				preempt_enable();
@@ -1200,7 +1200,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				 * context is already loaded.
 				 */
 				if (change & MIPS_CONF5_FRE &&
-				    vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+				    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
 					change_c0_config5(MIPS_CONF5_FRE, val);
 
 				/*
@@ -1210,7 +1210,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				 * quickly enabled again in the near future.
 				 */
 				if (change & MIPS_CONF5_MSAEN &&
-				    vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+				    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
 					change_c0_config5(MIPS_CONF5_MSAEN,
 							  val);
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 6e753761b5d6e5..9093262ff3cec4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1447,7 +1447,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
 	 * not to clobber the status register directly via the commpage.
 	 */
 	if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
-	    vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+	    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
 		kvm_lose_fpu(vcpu);
 
 	/*
@@ -1462,9 +1462,9 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
 	enable_fpu_hazard();
 
 	/* If guest FPU state not active, restore it now */
-	if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+	if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
 		__kvm_restore_fpu(&vcpu->arch);
-		vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
 	}
 
 	preempt_enable();
@@ -1491,8 +1491,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 		 * interacts with MSA state, so play it safe and save it first.
 		 */
 		if (!(sr & ST0_FR) &&
-		    (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
-				KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+		    (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU |
+				KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU)
 			kvm_lose_fpu(vcpu);
 
 		change_c0_status(ST0_CU1 | ST0_FR, sr);
@@ -1506,20 +1506,20 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 	set_c0_config5(MIPS_CONF5_MSAEN);
 	enable_fpu_hazard();
 
-	switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
-	case KVM_MIPS_FPU_FPU:
+	switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) {
+	case KVM_MIPS_AUX_FPU:
 		/*
 		 * Guest FPU state already loaded, only restore upper MSA state
 		 */
 		__kvm_restore_msa_upper(&vcpu->arch);
-		vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
 		break;
 	case 0:
 		/* Neither FPU or MSA already active, restore full MSA state */
 		__kvm_restore_msa(&vcpu->arch);
-		vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
 		if (kvm_mips_guest_has_fpu(&vcpu->arch))
-			vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+			vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
 		break;
 	default:
 		break;
@@ -1533,13 +1533,13 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 {
 	preempt_disable();
-	if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+	if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
 		disable_msa();
-		vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
 	}
-	if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+	if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
 		clear_c0_status(ST0_CU1 | ST0_FR);
-		vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
 	}
 	preempt_enable();
 }
@@ -1555,7 +1555,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 	 */
 
 	preempt_disable();
-	if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+	if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
 		set_c0_config5(MIPS_CONF5_MSAEN);
 		enable_fpu_hazard();
 
@@ -1563,17 +1563,17 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 
 		/* Disable MSA & FPU */
 		disable_msa();
-		if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+		if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
 			clear_c0_status(ST0_CU1 | ST0_FR);
 			disable_fpu_hazard();
 		}
-		vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
-	} else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+		vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
+	} else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
 		set_c0_status(ST0_CU1);
 		enable_fpu_hazard();
 
 		__kvm_save_fpu(&vcpu->arch);
-		vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
 
 		/* Disable FPU */
 		clear_c0_status(ST0_CU1 | ST0_FR);

From 04ebebf45a6ec61a4405040ea47c4320be5ed229 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:11 +0100
Subject: [PATCH 076/302] MIPS: KVM: Add kvm_aux trace event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a MIPS specific trace event for auxiliary context operations
(notably FPU and MSA). Unfortunately the generic kvm_fpu trace event
isn't flexible enough to handle the range of interesting things that can
happen with FPU and MSA context.

The type of state being operated on is traced:
- FPU: Just the FPU registers.
- MSA: Just the upper half of the MSA vector registers (low half already
       loaded with FPU state).
- FPU & MSA: Full MSA vector state (includes FPU state).

As is the type of operation:
- Restore: State was enabled and restored.
- Save: State was saved and disabled.
- Enable: State was enabled (already loaded).
- Disable: State was disabled (kept loaded).
- Discard: State was discarded and disabled.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
[Fix remaining occurrence of "fpu_msa", change to "aux". - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c  | 11 +++++++++++
 arch/mips/kvm/trace.h | 46 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9093262ff3cec4..c0e8f8640f2bbf 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1465,6 +1465,9 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
 		__kvm_restore_fpu(&vcpu->arch);
 		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU);
+	} else {
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU);
 	}
 
 	preempt_enable();
@@ -1513,6 +1516,7 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 		 */
 		__kvm_restore_msa_upper(&vcpu->arch);
 		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA);
 		break;
 	case 0:
 		/* Neither FPU or MSA already active, restore full MSA state */
@@ -1520,8 +1524,11 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
 		vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
 		if (kvm_mips_guest_has_fpu(&vcpu->arch))
 			vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE,
+			      KVM_TRACE_AUX_FPU_MSA);
 		break;
 	default:
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA);
 		break;
 	}
 
@@ -1535,10 +1542,12 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 	preempt_disable();
 	if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
 		disable_msa();
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA);
 		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
 	}
 	if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
 		clear_c0_status(ST0_CU1 | ST0_FR);
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU);
 		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
 	}
 	preempt_enable();
@@ -1560,6 +1569,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 		enable_fpu_hazard();
 
 		__kvm_save_msa(&vcpu->arch);
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
 
 		/* Disable MSA & FPU */
 		disable_msa();
@@ -1574,6 +1584,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 
 		__kvm_save_fpu(&vcpu->arch);
 		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
+		trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
 
 		/* Disable FPU */
 		clear_c0_status(ST0_CU1 | ST0_FR);
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index bd6437f67dc03b..f3ada591ca2538 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -38,6 +38,52 @@ TRACE_EVENT(kvm_exit,
 		      __entry->pc)
 );
 
+#define KVM_TRACE_AUX_RESTORE		0
+#define KVM_TRACE_AUX_SAVE		1
+#define KVM_TRACE_AUX_ENABLE		2
+#define KVM_TRACE_AUX_DISABLE		3
+#define KVM_TRACE_AUX_DISCARD		4
+
+#define KVM_TRACE_AUX_FPU		1
+#define KVM_TRACE_AUX_MSA		2
+#define KVM_TRACE_AUX_FPU_MSA		3
+
+#define kvm_trace_symbol_aux_op		\
+	{ KVM_TRACE_AUX_RESTORE, "restore" },	\
+	{ KVM_TRACE_AUX_SAVE,    "save" },	\
+	{ KVM_TRACE_AUX_ENABLE,  "enable" },	\
+	{ KVM_TRACE_AUX_DISABLE, "disable" },	\
+	{ KVM_TRACE_AUX_DISCARD, "discard" }
+
+#define kvm_trace_symbol_aux_state		\
+	{ KVM_TRACE_AUX_FPU,     "FPU" },	\
+	{ KVM_TRACE_AUX_MSA,     "MSA" },	\
+	{ KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" }
+
+TRACE_EVENT(kvm_aux,
+	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
+		     unsigned int state),
+	    TP_ARGS(vcpu, op, state),
+	    TP_STRUCT__entry(
+			__field(unsigned long, pc)
+			__field(u8, op)
+			__field(u8, state)
+	    ),
+
+	    TP_fast_assign(
+			__entry->pc = vcpu->arch.pc;
+			__entry->op = op;
+			__entry->state = state;
+	    ),
+
+	    TP_printk("%s %s PC: 0x%08lx",
+		      __print_symbolic(__entry->op,
+				       kvm_trace_symbol_aux_op),
+		      __print_symbolic(__entry->state,
+				       kvm_trace_symbol_aux_state),
+		      __entry->pc)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */

From 1e09e86ac13747903501004082bf1c5b7c6262b2 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:12 +0100
Subject: [PATCH 077/302] MIPS: KVM: Clean up kvm_exit trace event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clean up the MIPS kvm_exit trace event so that the exit reasons are
specified in a trace friendly way (via __print_symbolic), and so that
the exit reasons that derive straight from Cause.ExcCode values map
directly, allowing a single trace_kvm_exit() call to replace a bunch of
individual ones.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 22 ----------------
 arch/mips/kvm/emulate.c          |  4 +--
 arch/mips/kvm/mips.c             | 17 ++----------
 arch/mips/kvm/stats.c            | 21 ---------------
 arch/mips/kvm/trace.h            | 44 +++++++++++++++++++++++++++++---
 5 files changed, 45 insertions(+), 63 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index e6273850bab69d..b8cb7427074616 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -125,28 +125,6 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };
 
-enum kvm_mips_exit_types {
-	WAIT_EXITS,
-	CACHE_EXITS,
-	SIGNAL_EXITS,
-	INT_EXITS,
-	COP_UNUSABLE_EXITS,
-	TLBMOD_EXITS,
-	TLBMISS_LD_EXITS,
-	TLBMISS_ST_EXITS,
-	ADDRERR_ST_EXITS,
-	ADDRERR_LD_EXITS,
-	SYSCALL_EXITS,
-	RESVD_INST_EXITS,
-	BREAK_INST_EXITS,
-	TRAP_INST_EXITS,
-	MSA_FPE_EXITS,
-	FPE_EXITS,
-	MSA_DISABLED_EXITS,
-	FLUSH_DCACHE_EXITS,
-	MAX_KVM_MIPS_EXIT_TYPES
-};
-
 struct kvm_arch_memory_slot {
 };
 
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 8647bd97b934ad..fce08bda9ebccc 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -775,7 +775,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 		  vcpu->arch.pending_exceptions);
 
 	++vcpu->stat.wait_exits;
-	trace_kvm_exit(vcpu, WAIT_EXITS);
+	trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
 	if (!vcpu->arch.pending_exceptions) {
 		vcpu->arch.wait = 1;
 		kvm_vcpu_block(vcpu);
@@ -1718,7 +1718,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 
 	case cache_op:
 		++vcpu->stat.cache_exits;
-		trace_kvm_exit(vcpu, CACHE_EXITS);
+		trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
 		er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
 		break;
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index c0e8f8640f2bbf..e9e40b9dd9be3d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1257,6 +1257,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 	kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n",
 			cause, opc, run, vcpu);
+	trace_kvm_exit(vcpu, exccode);
 
 	/*
 	 * Do a privilege check, if in UM most of these exit conditions end up
@@ -1276,7 +1277,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc);
 
 		++vcpu->stat.int_exits;
-		trace_kvm_exit(vcpu, INT_EXITS);
 
 		if (need_resched())
 			cond_resched();
@@ -1288,7 +1288,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc);
 
 		++vcpu->stat.cop_unusable_exits;
-		trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS);
 		ret = kvm_mips_callbacks->handle_cop_unusable(vcpu);
 		/* XXXKYMA: Might need to return to user space */
 		if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN)
@@ -1297,7 +1296,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 	case EXCCODE_MOD:
 		++vcpu->stat.tlbmod_exits;
-		trace_kvm_exit(vcpu, TLBMOD_EXITS);
 		ret = kvm_mips_callbacks->handle_tlb_mod(vcpu);
 		break;
 
@@ -1307,7 +1305,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			  badvaddr);
 
 		++vcpu->stat.tlbmiss_st_exits;
-		trace_kvm_exit(vcpu, TLBMISS_ST_EXITS);
 		ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu);
 		break;
 
@@ -1316,61 +1313,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			  cause, opc, badvaddr);
 
 		++vcpu->stat.tlbmiss_ld_exits;
-		trace_kvm_exit(vcpu, TLBMISS_LD_EXITS);
 		ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu);
 		break;
 
 	case EXCCODE_ADES:
 		++vcpu->stat.addrerr_st_exits;
-		trace_kvm_exit(vcpu, ADDRERR_ST_EXITS);
 		ret = kvm_mips_callbacks->handle_addr_err_st(vcpu);
 		break;
 
 	case EXCCODE_ADEL:
 		++vcpu->stat.addrerr_ld_exits;
-		trace_kvm_exit(vcpu, ADDRERR_LD_EXITS);
 		ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu);
 		break;
 
 	case EXCCODE_SYS:
 		++vcpu->stat.syscall_exits;
-		trace_kvm_exit(vcpu, SYSCALL_EXITS);
 		ret = kvm_mips_callbacks->handle_syscall(vcpu);
 		break;
 
 	case EXCCODE_RI:
 		++vcpu->stat.resvd_inst_exits;
-		trace_kvm_exit(vcpu, RESVD_INST_EXITS);
 		ret = kvm_mips_callbacks->handle_res_inst(vcpu);
 		break;
 
 	case EXCCODE_BP:
 		++vcpu->stat.break_inst_exits;
-		trace_kvm_exit(vcpu, BREAK_INST_EXITS);
 		ret = kvm_mips_callbacks->handle_break(vcpu);
 		break;
 
 	case EXCCODE_TR:
 		++vcpu->stat.trap_inst_exits;
-		trace_kvm_exit(vcpu, TRAP_INST_EXITS);
 		ret = kvm_mips_callbacks->handle_trap(vcpu);
 		break;
 
 	case EXCCODE_MSAFPE:
 		++vcpu->stat.msa_fpe_exits;
-		trace_kvm_exit(vcpu, MSA_FPE_EXITS);
 		ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
 		break;
 
 	case EXCCODE_FPE:
 		++vcpu->stat.fpe_exits;
-		trace_kvm_exit(vcpu, FPE_EXITS);
 		ret = kvm_mips_callbacks->handle_fpe(vcpu);
 		break;
 
 	case EXCCODE_MSADIS:
 		++vcpu->stat.msa_disabled_exits;
-		trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
 		ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
 		break;
 
@@ -1397,7 +1384,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			run->exit_reason = KVM_EXIT_INTR;
 			ret = (-EINTR << 2) | RESUME_HOST;
 			++vcpu->stat.signal_exits;
-			trace_kvm_exit(vcpu, SIGNAL_EXITS);
+			trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL);
 		}
 	}
 
diff --git a/arch/mips/kvm/stats.c b/arch/mips/kvm/stats.c
index 888bb67070ac6d..53f851a615542a 100644
--- a/arch/mips/kvm/stats.c
+++ b/arch/mips/kvm/stats.c
@@ -11,27 +11,6 @@
 
 #include <linux/kvm_host.h>
 
-char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
-	"WAIT",
-	"CACHE",
-	"Signal",
-	"Interrupt",
-	"COP0/1 Unusable",
-	"TLB Mod",
-	"TLB Miss (LD)",
-	"TLB Miss (ST)",
-	"Address Err (ST)",
-	"Address Error (LD)",
-	"System Call",
-	"Reserved Inst",
-	"Break Inst",
-	"Trap Inst",
-	"MSA FPE",
-	"FPE",
-	"MSA Disabled",
-	"D-Cache Flushes",
-};
-
 char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
 	"Index",
 	"Random",
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index f3ada591ca2538..ffa6ee8f20250a 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -17,8 +17,45 @@
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE trace
 
-/* Tracepoints for VM eists */
-extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES];
+/* The first 32 exit reasons correspond to Cause.ExcCode */
+#define KVM_TRACE_EXIT_INT		 0
+#define KVM_TRACE_EXIT_TLBMOD		 1
+#define KVM_TRACE_EXIT_TLBMISS_LD	 2
+#define KVM_TRACE_EXIT_TLBMISS_ST	 3
+#define KVM_TRACE_EXIT_ADDRERR_LD	 4
+#define KVM_TRACE_EXIT_ADDRERR_ST	 5
+#define KVM_TRACE_EXIT_SYSCALL		 8
+#define KVM_TRACE_EXIT_BREAK_INST	 9
+#define KVM_TRACE_EXIT_RESVD_INST	10
+#define KVM_TRACE_EXIT_COP_UNUSABLE	11
+#define KVM_TRACE_EXIT_TRAP_INST	13
+#define KVM_TRACE_EXIT_MSA_FPE		14
+#define KVM_TRACE_EXIT_FPE		15
+#define KVM_TRACE_EXIT_MSA_DISABLED	21
+/* Further exit reasons */
+#define KVM_TRACE_EXIT_WAIT		32
+#define KVM_TRACE_EXIT_CACHE		33
+#define KVM_TRACE_EXIT_SIGNAL		34
+
+/* Tracepoints for VM exits */
+#define kvm_trace_symbol_exit_types				\
+	{ KVM_TRACE_EXIT_INT,		"Interrupt" },		\
+	{ KVM_TRACE_EXIT_TLBMOD,	"TLB Mod" },		\
+	{ KVM_TRACE_EXIT_TLBMISS_LD,	"TLB Miss (LD)" },	\
+	{ KVM_TRACE_EXIT_TLBMISS_ST,	"TLB Miss (ST)" },	\
+	{ KVM_TRACE_EXIT_ADDRERR_LD,	"Address Error (LD)" },	\
+	{ KVM_TRACE_EXIT_ADDRERR_ST,	"Address Err (ST)" },	\
+	{ KVM_TRACE_EXIT_SYSCALL,	"System Call" },	\
+	{ KVM_TRACE_EXIT_BREAK_INST,	"Break Inst" },		\
+	{ KVM_TRACE_EXIT_RESVD_INST,	"Reserved Inst" },	\
+	{ KVM_TRACE_EXIT_COP_UNUSABLE,	"COP0/1 Unusable" },	\
+	{ KVM_TRACE_EXIT_TRAP_INST,	"Trap Inst" },		\
+	{ KVM_TRACE_EXIT_MSA_FPE,	"MSA FPE" },		\
+	{ KVM_TRACE_EXIT_FPE,		"FPE" },		\
+	{ KVM_TRACE_EXIT_MSA_DISABLED,	"MSA Disabled" },	\
+	{ KVM_TRACE_EXIT_WAIT,		"WAIT" },		\
+	{ KVM_TRACE_EXIT_CACHE,		"CACHE" },		\
+	{ KVM_TRACE_EXIT_SIGNAL,	"Signal" }
 
 TRACE_EVENT(kvm_exit,
 	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -34,7 +71,8 @@ TRACE_EVENT(kvm_exit,
 	    ),
 
 	    TP_printk("[%s]PC: 0x%08lx",
-		      kvm_mips_exit_types_str[__entry->reason],
+		      __print_symbolic(__entry->reason,
+				       kvm_trace_symbol_exit_types),
 		      __entry->pc)
 );
 

From 9887d1c75ba2f210b403d536ab025d4b2b36fb57 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:13 +0100
Subject: [PATCH 078/302] MIPS: KVM: Add kvm_asid_change trace event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a trace event for guest ASID changes, replacing the existing
kvm_debug call.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c |  7 +++----
 arch/mips/kvm/trace.h   | 22 ++++++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index fce08bda9ebccc..ee0e61d2b6fb26 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1082,11 +1082,10 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
 				    ((kvm_read_c0_guest_entryhi(cop0) &
 				      KVM_ENTRYHI_ASID) != nasid)) {
-					kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
+					trace_kvm_asid_change(vcpu,
 						kvm_read_c0_guest_entryhi(cop0)
-						& KVM_ENTRYHI_ASID,
-						vcpu->arch.gprs[rt]
-						& KVM_ENTRYHI_ASID);
+							& KVM_ENTRYHI_ASID,
+						nasid);
 
 					/* Blow away the shadow host TLBs */
 					kvm_mips_flush_host_tlb(1);
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index ffa6ee8f20250a..7daf7474d6a689 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -122,6 +122,28 @@ TRACE_EVENT(kvm_aux,
 		      __entry->pc)
 );
 
+TRACE_EVENT(kvm_asid_change,
+	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid,
+		     unsigned int new_asid),
+	    TP_ARGS(vcpu, old_asid, new_asid),
+	    TP_STRUCT__entry(
+			__field(unsigned long, pc)
+			__field(u8, old_asid)
+			__field(u8, new_asid)
+	    ),
+
+	    TP_fast_assign(
+			__entry->pc = vcpu->arch.pc;
+			__entry->old_asid = old_asid;
+			__entry->new_asid = new_asid;
+	    ),
+
+	    TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x",
+		      __entry->pc,
+		      __entry->old_asid,
+		      __entry->new_asid)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */

From 93258604ab6d3f2bdc6cb02f61961af56712f144 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:14 +0100
Subject: [PATCH 079/302] MIPS: KVM: Add guest mode switch trace events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a few trace events for entering and coming out of guest mode, as well
as re-entering it from a guest exit exception.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c  |  4 ++++
 arch/mips/kvm/trace.h | 48 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index e9e40b9dd9be3d..b5ad2ba1847ab4 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -410,7 +410,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	/* Disable hardware page table walking while in guest */
 	htw_stop();
 
+	trace_kvm_enter(vcpu);
 	r = vcpu->arch.vcpu_run(run, vcpu);
+	trace_kvm_out(vcpu);
 
 	/* Re-enable HTW before enabling interrupts */
 	htw_start();
@@ -1389,6 +1391,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	if (ret == RESUME_GUEST) {
+		trace_kvm_reenter(vcpu);
+
 		/*
 		 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
 		 * is live), restore FCR31 / MSACSR.
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index 7daf7474d6a689..aec1c43f2b4423 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -17,6 +17,54 @@
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE trace
 
+/*
+ * Tracepoints for entering, re-entering and leaving guest mode
+ */
+TRACE_EVENT(kvm_enter,
+	    TP_PROTO(struct kvm_vcpu *vcpu),
+	    TP_ARGS(vcpu),
+	    TP_STRUCT__entry(
+			__field(unsigned long, pc)
+	    ),
+
+	    TP_fast_assign(
+			__entry->pc = vcpu->arch.pc;
+	    ),
+
+	    TP_printk("PC: 0x%08lx",
+		      __entry->pc)
+);
+
+TRACE_EVENT(kvm_reenter,
+	    TP_PROTO(struct kvm_vcpu *vcpu),
+	    TP_ARGS(vcpu),
+	    TP_STRUCT__entry(
+			__field(unsigned long, pc)
+	    ),
+
+	    TP_fast_assign(
+			__entry->pc = vcpu->arch.pc;
+	    ),
+
+	    TP_printk("PC: 0x%08lx",
+		      __entry->pc)
+);
+
+TRACE_EVENT(kvm_out,
+	    TP_PROTO(struct kvm_vcpu *vcpu),
+	    TP_ARGS(vcpu),
+	    TP_STRUCT__entry(
+			__field(unsigned long, pc)
+	    ),
+
+	    TP_fast_assign(
+			__entry->pc = vcpu->arch.pc;
+	    ),
+
+	    TP_printk("PC: 0x%08lx",
+		      __entry->pc)
+);
+
 /* The first 32 exit reasons correspond to Cause.ExcCode */
 #define KVM_TRACE_EXIT_INT		 0
 #define KVM_TRACE_EXIT_TLBMOD		 1

From 6398da1391ba9285aeb4fa3f4470f008bf730220 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:15 +0100
Subject: [PATCH 080/302] MIPS: KVM: Trace guest register access emulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trace emulation of guest access to various registers via
MFC0/MTC0/DMFC0/DMTC0 instructions (coprocessor 0) and the RDHWR
instruction (hardware registers exposed to userland), replacing some
existing kvm_debug calls. Trace events are much more practical for this
kind of debug output.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 31 +++++++++------
 arch/mips/kvm/trace.h   | 88 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index ee0e61d2b6fb26..2004e35288d097 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -979,7 +979,6 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
 	u32 rt, rd, copz, sel, co_bit, op;
-	unsigned long pc = vcpu->arch.pc;
 	unsigned long curr_pc;
 
 	/*
@@ -1046,20 +1045,27 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 #endif
 			}
 
-			kvm_debug
-			    ("[%#lx] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
-			     pc, rd, sel, rt, vcpu->arch.gprs[rt]);
-
+			trace_kvm_hwr(vcpu, KVM_TRACE_MFC0,
+				      KVM_TRACE_COP0(rd, sel),
+				      vcpu->arch.gprs[rt]);
 			break;
 
 		case dmfc_op:
 			vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+
+			trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0,
+				      KVM_TRACE_COP0(rd, sel),
+				      vcpu->arch.gprs[rt]);
 			break;
 
 		case mtc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
 			cop0->stat[rd][sel]++;
 #endif
+			trace_kvm_hwr(vcpu, KVM_TRACE_MTC0,
+				      KVM_TRACE_COP0(rd, sel),
+				      vcpu->arch.gprs[rt]);
+
 			if ((rd == MIPS_CP0_TLB_INDEX)
 			    && (vcpu->arch.gprs[rt] >=
 				KVM_MIPS_GUEST_TLB_SIZE)) {
@@ -1098,10 +1104,6 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
 				goto done;
 			} else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
-				kvm_debug("[%#lx] MTCz, COMPARE %#lx <- %#lx\n",
-					  pc, kvm_read_c0_guest_compare(cop0),
-					  vcpu->arch.gprs[rt]);
-
 				/* If we are writing to COMPARE */
 				/* Clear pending timer interrupt, if any */
 				kvm_mips_write_compare(vcpu,
@@ -1237,14 +1239,14 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				kvm_mips_trans_mtc0(inst, opc, vcpu);
 #endif
 			}
-
-			kvm_debug("[%#lx] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
-				  rd, sel, cop0->reg[rd][sel]);
 			break;
 
 		case dmtc_op:
 			kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n",
 				vcpu->arch.pc, rt, rd, sel);
+			trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0,
+				      KVM_TRACE_COP0(rd, sel),
+				      vcpu->arch.gprs[rt]);
 			er = EMULATE_FAIL;
 			break;
 
@@ -2307,6 +2309,8 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 		int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
 		int rd = (inst & RD) >> 11;
 		int rt = (inst & RT) >> 16;
+		int sel = (inst >> 6) & 0x7;
+
 		/* If usermode, check RDHWR rd is allowed by guest HWREna */
 		if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
 			kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
@@ -2342,6 +2346,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 			kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
 			goto emulate_ri;
 		}
+
+		trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
+			      vcpu->arch.gprs[rt]);
 	} else {
 		kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
 		goto emulate_ri;
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index aec1c43f2b4423..5d712ecb07344b 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -124,6 +124,94 @@ TRACE_EVENT(kvm_exit,
 		      __entry->pc)
 );
 
+#define KVM_TRACE_MFC0		0
+#define KVM_TRACE_MTC0		1
+#define KVM_TRACE_DMFC0		2
+#define KVM_TRACE_DMTC0		3
+#define KVM_TRACE_RDHWR		4
+
+#define KVM_TRACE_HWR_COP0	0
+#define KVM_TRACE_HWR_HWR	1
+
+#define KVM_TRACE_COP0(REG, SEL)	((KVM_TRACE_HWR_COP0 << 8) |	\
+					 ((REG) << 3) | (SEL))
+#define KVM_TRACE_HWR(REG, SEL)		((KVM_TRACE_HWR_HWR  << 8) |	\
+					 ((REG) << 3) | (SEL))
+
+#define kvm_trace_symbol_hwr_ops				\
+	{ KVM_TRACE_MFC0,		"MFC0" },		\
+	{ KVM_TRACE_MTC0,		"MTC0" },		\
+	{ KVM_TRACE_DMFC0,		"DMFC0" },		\
+	{ KVM_TRACE_DMTC0,		"DMTC0" },		\
+	{ KVM_TRACE_RDHWR,		"RDHWR" }
+
+#define kvm_trace_symbol_hwr_cop				\
+	{ KVM_TRACE_HWR_COP0,		"COP0" },		\
+	{ KVM_TRACE_HWR_HWR,		"HWR" }
+
+#define kvm_trace_symbol_hwr_regs				\
+	{ KVM_TRACE_COP0( 0, 0),	"Index" },		\
+	{ KVM_TRACE_COP0( 2, 0),	"EntryLo0" },		\
+	{ KVM_TRACE_COP0( 3, 0),	"EntryLo1" },		\
+	{ KVM_TRACE_COP0( 4, 0),	"Context" },		\
+	{ KVM_TRACE_COP0( 4, 2),	"UserLocal" },		\
+	{ KVM_TRACE_COP0( 5, 0),	"PageMask" },		\
+	{ KVM_TRACE_COP0( 6, 0),	"Wired" },		\
+	{ KVM_TRACE_COP0( 7, 0),	"HWREna" },		\
+	{ KVM_TRACE_COP0( 8, 0),	"BadVAddr" },		\
+	{ KVM_TRACE_COP0( 9, 0),	"Count" },		\
+	{ KVM_TRACE_COP0(10, 0),	"EntryHi" },		\
+	{ KVM_TRACE_COP0(11, 0),	"Compare" },		\
+	{ KVM_TRACE_COP0(12, 0),	"Status" },		\
+	{ KVM_TRACE_COP0(12, 1),	"IntCtl" },		\
+	{ KVM_TRACE_COP0(12, 2),	"SRSCtl" },		\
+	{ KVM_TRACE_COP0(13, 0),	"Cause" },		\
+	{ KVM_TRACE_COP0(14, 0),	"EPC" },		\
+	{ KVM_TRACE_COP0(15, 0),	"PRId" },		\
+	{ KVM_TRACE_COP0(15, 1),	"EBase" },		\
+	{ KVM_TRACE_COP0(16, 0),	"Config" },		\
+	{ KVM_TRACE_COP0(16, 1),	"Config1" },		\
+	{ KVM_TRACE_COP0(16, 2),	"Config2" },		\
+	{ KVM_TRACE_COP0(16, 3),	"Config3" },		\
+	{ KVM_TRACE_COP0(16, 4),	"Config4" },		\
+	{ KVM_TRACE_COP0(16, 5),	"Config5" },		\
+	{ KVM_TRACE_COP0(16, 7),	"Config7" },		\
+	{ KVM_TRACE_COP0(26, 0),	"ECC" },		\
+	{ KVM_TRACE_COP0(30, 0),	"ErrorEPC" },		\
+	{ KVM_TRACE_HWR( 0, 0),		"CPUNum" },		\
+	{ KVM_TRACE_HWR( 1, 0),		"SYNCI_Step" },		\
+	{ KVM_TRACE_HWR( 2, 0),		"CC" },			\
+	{ KVM_TRACE_HWR( 3, 0),		"CCRes" },		\
+	{ KVM_TRACE_HWR(29, 0),		"ULR" }
+
+TRACE_EVENT(kvm_hwr,
+	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg,
+		     unsigned long val),
+	    TP_ARGS(vcpu, op, reg, val),
+	    TP_STRUCT__entry(
+			__field(unsigned long, val)
+			__field(u16, reg)
+			__field(u8, op)
+	    ),
+
+	    TP_fast_assign(
+			__entry->val = val;
+			__entry->reg = reg;
+			__entry->op = op;
+	    ),
+
+	    TP_printk("%s %s (%s:%u:%u) 0x%08lx",
+		      __print_symbolic(__entry->op,
+				       kvm_trace_symbol_hwr_ops),
+		      __print_symbolic(__entry->reg,
+				       kvm_trace_symbol_hwr_regs),
+		      __print_symbolic(__entry->reg >> 8,
+				       kvm_trace_symbol_hwr_cop),
+		      (__entry->reg >> 3) & 0x1f,
+		      __entry->reg & 0x7,
+		      __entry->val)
+);
+
 #define KVM_TRACE_AUX_RESTORE		0
 #define KVM_TRACE_AUX_SAVE		1
 #define KVM_TRACE_AUX_ENABLE		2

From eafc4ed206c562d205de9d2acf40d57616faaf03 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:16 +0100
Subject: [PATCH 081/302] MIPS: KVM: Dump guest tlbs if kvm_get_inst() fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If kvm_get_inst() fails to find a guest TLB mapping for the guest PC
then dump the guest TLB entries. The contents of the guest TLB are likely
to be more interesting than the host TLB entries.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: kvm@vger.kernel.org
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 208f70409ccb44..2f494ec5c939c0 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -346,6 +346,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 				kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
 					__func__, opc, vcpu, read_c0_entryhi());
 				kvm_mips_dump_host_tlbs();
+				kvm_mips_dump_guest_tlbs(vcpu);
 				local_irq_restore(flags);
 				return KVM_INVALID_INST;
 			}

From d86c1ebe8e3d8a13aea9ce8437405d0ea3765698 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 14 Jun 2016 09:40:17 +0100
Subject: [PATCH 082/302] MIPS: KVM: Print unknown load/store encodings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When trying to emulate an unrecognised load or store instruction, print
the encoding to aid debugging.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 2004e35288d097..ff4072c2b25e36 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1412,7 +1412,8 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 		break;
 
 	default:
-		kvm_err("Store not yet supported");
+		kvm_err("Store not yet supported (inst=0x%08x)\n",
+			inst);
 		er = EMULATE_FAIL;
 		break;
 	}
@@ -1522,7 +1523,8 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
 		break;
 
 	default:
-		kvm_err("Load not yet supported");
+		kvm_err("Load not yet supported (inst=0x%08x)\n",
+			inst);
 		er = EMULATE_FAIL;
 		break;
 	}

From 6a727b0b3f9305b2c9f107decee3d8b9122de9e1 Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 21 May 2016 13:48:35 +0200
Subject: [PATCH 083/302] KVM: ARM: Fix typos

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/arm.c     | 2 +-
 arch/arm/kvm/emulate.c | 2 +-
 arch/arm/kvm/guest.c   | 2 +-
 arch/arm/kvm/reset.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 893941ec98dc6d..f20ca84537f5d7 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -376,7 +376,7 @@ void force_vm_exit(const cpumask_t *mask)
 
 /**
  * need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to checkt
+ * @kvm: The VM's VMID to check
  *
  * return true if there is a new generation of VMIDs being used
  *
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index a494def3f19569..af93e3ffc9f308 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu)
  * @vcpu:	The VCPU pointer
  *
  * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
  * to do this little bit of work manually. The fields map like this:
  *
  * IT[7:0] -> CPSR[26:25],CPSR[15:10]
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 9093ed0f8b2a71..9aca92074f8546 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend coproc regs.
+ * We do core registers right here, then we append coproc regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 0048b5a62a509b..4b5e802e57d1b6 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on the
- * virtual CPU struct to their architectually defined reset values.
+ * virtual CPU struct to their architecturally defined reset values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {

From edce2292c1e026c6a2da6899c114d930bc1f518b Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 21 May 2016 13:53:14 +0200
Subject: [PATCH 084/302] KVM: ARM64: Fix typos

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_arm.h | 2 +-
 arch/arm64/kvm/guest.c           | 2 +-
 arch/arm64/kvm/reset.c           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 2cdb6b551ac620..4b5c977af4653d 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -178,7 +178,7 @@
 /* Hyp System Trap Register */
 #define HSTR_EL2_T(x)	(1 << x)
 
-/* Hyp Coproccessor Trap Register Shifts */
+/* Hyp Coprocessor Trap Register Shifts */
 #define CPTR_EL2_TFP_SHIFT 10
 
 /* Hyp Coprocessor Trap Register */
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 32fad75bb9ff59..3f9e15722473bc 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
 /**
  * kvm_arm_copy_reg_indices - get indices of all registers.
  *
- * We do core registers right here, then we apppend system regs.
+ * We do core registers right here, then we append system regs.
  */
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 {
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b1ad730e156748..7be24f2b18dbd7 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -98,7 +98,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
  * @vcpu: The VCPU pointer
  *
  * This function finds the right table above and sets the registers on
- * the virtual CPU struct to their architectually defined reset
+ * the virtual CPU struct to their architecturally defined reset
  * values.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)

From 960cb306e63d4efde7753c0a2f2cef523a41e8ec Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 21 May 2016 14:08:55 +0200
Subject: [PATCH 085/302] KVM: S390: Fix typo

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/guestdbg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index e8c6843b9600cd..1e0849e209650d 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -465,7 +465,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 		guest_perc &= ~PER_EVENT_IFETCH;
 
 	/* All other PER events will be given to the guest */
-	/* TODO: Check alterated address/address space */
+	/* TODO: Check altered address/address space */
 
 	vcpu->arch.sie_block->perc = guest_perc >> 24;
 

From bb3541f175a977198d128f3a4e13534e019754a3 Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 21 May 2016 14:14:44 +0200
Subject: [PATCH 086/302] KVM: x86: Fix typos

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/locking.txt | 4 ++--
 arch/x86/kvm/mmu.c                    | 2 +-
 arch/x86/kvm/pmu_intel.c              | 2 +-
 arch/x86/kvm/svm.c                    | 2 +-
 arch/x86/kvm/vmx.c                    | 2 +-
 arch/x86/kvm/x86.c                    | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 19f94a6b9bb0df..f2491a8c68b4a6 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits():
    old_spte = *spte;
 
    /* 'if' condition is satisfied. */
-   if (old_spte.Accssed == 1 &&
+   if (old_spte.Accessed == 1 &&
         old_spte.W == 0)
       spte = 0ull;
                                          on fast page fault path:
@@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits():
       old_spte = xchg(spte, 0ull)
 
 
-   if (old_spte.Accssed == 1)
+   if (old_spte.Accessed == 1)
       kvm_set_pfn_accessed(spte.pfn);
    if (old_spte.Dirty == 1)
       kvm_set_pfn_dirty(spte.pfn);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index def97b3a392b52..837bf23c5b067c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -523,7 +523,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 }
 
 /* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
+ * Update the state bits, it means the mapped pfn is not changed.
  *
  * Whenever we overwrite a writable spte with a read-only one we
  * should flush remote TLBs. Otherwise rmap_write_protect
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index ab38af4f4947f6..9d4a8504a95a3b 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx)
 	return intel_arch_events[fixed_pmc_events[idx]].event_type;
 }
 
-/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+/* check if a PMC is enabled by comparing it with global_ctrl bits. */
 static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1163e8173e5a71..5ff2927781100f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1572,7 +1572,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
        /*
-        * Any change of EFLAGS.VM is accompained by a reload of SS
+        * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
         * so we do not need to update the CPL here.
         */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb93010beaa4df..57ec6a4b49581e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3364,7 +3364,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
 	/*
 	 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
-	 * but due to arrata below it can't be used. Workaround is to use
+	 * but due to errata below it can't be used. Workaround is to use
 	 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
 	 *
 	 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d6a305936553d..bf227212aebb15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8418,7 +8418,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	/*
 	 * When producer of consumer is unregistered, we change back to
 	 * remapped mode, so we can re-use the current implementation
-	 * when the irq is masked/disabed or the consumer side (KVM
+	 * when the irq is masked/disabled or the consumer side (KVM
 	 * int this case doesn't want to receive the interrupts.
 	*/
 	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);

From 66ffc50c480e7ab6ad5642f47276435a8873c31a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:45 +0100
Subject: [PATCH 087/302] MIPS: KVM: Fix translation of MFC0 ErrCtl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MIPS KVM dynamic translation is meant to translate "MFC0 rt, ErrCtl"
instructions into "ADD rt, zero, zero" to zero the destination register,
however the rt register number was copied into rt of the ADD instruction
encoding, which is the 2nd source operand. This results in "ADD zero,
zero, rt" which is a no-op, so only the first execution of each such
MFC0 from ErrCtl will actually read 0.

Fix the shift to put the rt from the MFC0 encoding into the rd field of
the ADD.

Fixes: 50c8308538dc ("KVM/MIPS32: Binary patching of select privileged instructions.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index d4a86fb239cdf4..79b134c913336a 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -82,7 +82,7 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 
 	if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
 		mfc0_inst = CLEAR_TEMPLATE;
-		mfc0_inst |= ((rt & 0x1f) << 16);
+		mfc0_inst |= ((rt & 0x1f) << 11);
 	} else {
 		mfc0_inst = LW_TEMPLATE;
 		mfc0_inst |= ((rt & 0x1f) << 16);

From d5cd26bcfc881f5443d510e3acd40b30d7b7d0df Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:46 +0100
Subject: [PATCH 088/302] MIPS: KVM: Factor writing of translated guest
 instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code in arch/mips/kvm/dyntrans.c to write a translated guest
instruction to guest memory depending on the segment is duplicated
between each of the functions. Additionally the cache op translation
functions assume the instruction is in the KSEG0/1 segment rather than
KSEG2/3, which is generally true but isn't guaranteed.

Factor that code into a new kvm_mips_trans_replace() which handles both
KSEG0/1 and KSEG2/3.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c | 92 +++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 58 deletions(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 79b134c913336a..eb6e0d17a6682f 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -28,21 +28,41 @@
 #define CLEAR_TEMPLATE  0x00000020
 #define SW_TEMPLATE     0xac000000
 
+/**
+ * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
+ * @vcpu:	Virtual CPU.
+ * @opc:	PC of instruction to replace.
+ * @replace:	Instruction to write
+ */
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace)
+{
+	unsigned long kseg0_opc, flags;
+
+	if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
+		kseg0_opc =
+		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
+			       (vcpu, (unsigned long) opc));
+		memcpy((void *)kseg0_opc, (void *)&replace, sizeof(u32));
+		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
+	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+		local_irq_save(flags);
+		memcpy((void *)opc, (void *)&replace, sizeof(u32));
+		local_flush_icache_range((unsigned long)opc,
+					 (unsigned long)opc + 32);
+		local_irq_restore(flags);
+	} else {
+		kvm_err("%s: Invalid address: %p\n", __func__, opc);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
 			       struct kvm_vcpu *vcpu)
 {
-	int result = 0;
-	unsigned long kseg0_opc;
-	u32 synci_inst = 0x0;
-
 	/* Replace the CACHE instruction, with a NOP */
-	kseg0_opc =
-	    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-		       (vcpu, (unsigned long) opc));
-	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32));
-	local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-	return result;
+	return kvm_mips_trans_replace(vcpu, opc, 0x00000000);
 }
 
 /*
@@ -52,8 +72,6 @@ int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
 int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 			    struct kvm_vcpu *vcpu)
 {
-	int result = 0;
-	unsigned long kseg0_opc;
 	u32 synci_inst = SYNCI_TEMPLATE, base, offset;
 
 	base = (inst >> 21) & 0x1f;
@@ -61,20 +79,13 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
 	synci_inst |= (base << 21);
 	synci_inst |= offset;
 
-	kseg0_opc =
-	    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-		       (vcpu, (unsigned long) opc));
-	memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32));
-	local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
-	return result;
+	return kvm_mips_trans_replace(vcpu, opc, synci_inst);
 }
 
 int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
 	u32 rt, rd, sel;
 	u32 mfc0_inst;
-	unsigned long kseg0_opc, flags;
 
 	rt = (inst >> 16) & 0x1f;
 	rd = (inst >> 11) & 0x1f;
@@ -90,31 +101,13 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 				      cop0.reg[rd][sel]);
 	}
 
-	if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-		kseg0_opc =
-		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-			       (vcpu, (unsigned long) opc));
-		memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(u32));
-		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-		local_irq_save(flags);
-		memcpy((void *)opc, (void *)&mfc0_inst, sizeof(u32));
-		local_flush_icache_range((unsigned long)opc,
-					 (unsigned long)opc + 32);
-		local_irq_restore(flags);
-	} else {
-		kvm_err("%s: Invalid address: %p\n", __func__, opc);
-		return -EFAULT;
-	}
-
-	return 0;
+	return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
 }
 
 int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 {
 	u32 rt, rd, sel;
 	u32 mtc0_inst = SW_TEMPLATE;
-	unsigned long kseg0_opc, flags;
 
 	rt = (inst >> 16) & 0x1f;
 	rd = (inst >> 11) & 0x1f;
@@ -123,22 +116,5 @@ int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
 	mtc0_inst |= ((rt & 0x1f) << 16);
 	mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
 
-	if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-		kseg0_opc =
-		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-			       (vcpu, (unsigned long) opc));
-		memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(u32));
-		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
-		local_irq_save(flags);
-		memcpy((void *)opc, (void *)&mtc0_inst, sizeof(u32));
-		local_flush_icache_range((unsigned long)opc,
-					 (unsigned long)opc + 32);
-		local_irq_restore(flags);
-	} else {
-		kvm_err("%s: Invalid address: %p\n", __func__, opc);
-		return -EFAULT;
-	}
-
-	return 0;
+	return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
 }

From 258f3a2ea93ff7e322006c716cedc4fa3d861453 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:47 +0100
Subject: [PATCH 089/302] MIPS: KVM: Convert emulation to use asm/inst.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert various MIPS KVM guest instruction emulation functions to decode
instructions (and encode translations) using the union mips_instruction
and related enumerations in asm/inst.h rather than #defines and
hardcoded values.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h  |  22 +++---
 arch/mips/include/uapi/asm/inst.h |  35 +++++++++-
 arch/mips/kvm/dyntrans.c          |  74 ++++++++++----------
 arch/mips/kvm/emulate.c           | 109 ++++++++++++------------------
 4 files changed, 126 insertions(+), 114 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b8cb7427074616..1e002136f514b6 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -19,6 +19,7 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 
+#include <asm/inst.h>
 #include <asm/mipsregs.h>
 
 /* MIPS KVM register ids */
@@ -733,21 +734,21 @@ enum emulation_result kvm_mips_check_privilege(u32 cause,
 					       struct kvm_run *run,
 					       struct kvm_vcpu *vcpu);
 
-enum emulation_result kvm_mips_emulate_cache(u32 inst,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 					     u32 *opc,
 					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(u32 inst,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 					   u32 *opc,
 					   u32 cause,
 					   struct kvm_run *run,
 					   struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(u32 inst,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
 					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(u32 inst,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 					    u32 cause,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu);
@@ -758,11 +759,14 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
 /* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
-				      struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_index(union mips_instruction inst,
+				      u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
+				   struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+			       struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+			       struct kvm_vcpu *vcpu);
 
 /* Misc */
 extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 8051f9aa13796f..a1ebf973725c79 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -103,7 +103,7 @@ enum rt_op {
 	bltzal_op, bgezal_op, bltzall_op, bgezall_op,
 	rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17,
 	rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b,
-	bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f
+	bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op
 };
 
 /*
@@ -586,6 +586,36 @@ struct r_format {			/* Register format */
 	;))))))
 };
 
+struct c0r_format {			/* C0 register format */
+	__BITFIELD_FIELD(unsigned int opcode : 6,
+	__BITFIELD_FIELD(unsigned int rs : 5,
+	__BITFIELD_FIELD(unsigned int rt : 5,
+	__BITFIELD_FIELD(unsigned int rd : 5,
+	__BITFIELD_FIELD(unsigned int z: 8,
+	__BITFIELD_FIELD(unsigned int sel : 3,
+	;))))))
+};
+
+struct mfmc0_format {			/* MFMC0 register format */
+	__BITFIELD_FIELD(unsigned int opcode : 6,
+	__BITFIELD_FIELD(unsigned int rs : 5,
+	__BITFIELD_FIELD(unsigned int rt : 5,
+	__BITFIELD_FIELD(unsigned int rd : 5,
+	__BITFIELD_FIELD(unsigned int re : 5,
+	__BITFIELD_FIELD(unsigned int sc : 1,
+	__BITFIELD_FIELD(unsigned int : 2,
+	__BITFIELD_FIELD(unsigned int sel : 3,
+	;))))))))
+};
+
+struct co_format {			/* C0 CO format */
+	__BITFIELD_FIELD(unsigned int opcode : 6,
+	__BITFIELD_FIELD(unsigned int co : 1,
+	__BITFIELD_FIELD(unsigned int code : 19,
+	__BITFIELD_FIELD(unsigned int func : 6,
+	;))))
+};
+
 struct p_format {		/* Performance counter format (R10000) */
 	__BITFIELD_FIELD(unsigned int opcode : 6,
 	__BITFIELD_FIELD(unsigned int rs : 5,
@@ -937,6 +967,9 @@ union mips_instruction {
 	struct u_format u_format;
 	struct c_format c_format;
 	struct r_format r_format;
+	struct c0r_format c0r_format;
+	struct mfmc0_format mfmc0_format;
+	struct co_format co_format;
 	struct p_format p_format;
 	struct f_format f_format;
 	struct ma_format ma_format;
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index eb6e0d17a6682f..a3031dae8d1bb5 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -20,21 +20,14 @@
 
 #include "commpage.h"
 
-#define SYNCI_TEMPLATE  0x041f0000
-#define SYNCI_BASE(x)   (((x) >> 21) & 0x1f)
-#define SYNCI_OFFSET    ((x) & 0xffff)
-
-#define LW_TEMPLATE     0x8c000000
-#define CLEAR_TEMPLATE  0x00000020
-#define SW_TEMPLATE     0xac000000
-
 /**
  * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
  * @vcpu:	Virtual CPU.
  * @opc:	PC of instruction to replace.
  * @replace:	Instruction to write
  */
-static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace)
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
+				  union mips_instruction replace)
 {
 	unsigned long kseg0_opc, flags;
 
@@ -58,63 +51,68 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace)
 	return 0;
 }
 
-int kvm_mips_trans_cache_index(u32 inst, u32 *opc,
+int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc,
 			       struct kvm_vcpu *vcpu)
 {
+	union mips_instruction nop_inst = { 0 };
+
 	/* Replace the CACHE instruction, with a NOP */
-	return kvm_mips_trans_replace(vcpu, opc, 0x00000000);
+	return kvm_mips_trans_replace(vcpu, opc, nop_inst);
 }
 
 /*
  * Address based CACHE instructions are transformed into synci(s). A little
  * heavy for just D-cache invalidates, but avoids an expensive trap
  */
-int kvm_mips_trans_cache_va(u32 inst, u32 *opc,
+int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
 			    struct kvm_vcpu *vcpu)
 {
-	u32 synci_inst = SYNCI_TEMPLATE, base, offset;
+	union mips_instruction synci_inst = { 0 };
 
-	base = (inst >> 21) & 0x1f;
-	offset = inst & 0xffff;
-	synci_inst |= (base << 21);
-	synci_inst |= offset;
+	synci_inst.i_format.opcode = bcond_op;
+	synci_inst.i_format.rs = inst.i_format.rs;
+	synci_inst.i_format.rt = synci_op;
+	synci_inst.i_format.simmediate = inst.i_format.simmediate;
 
 	return kvm_mips_trans_replace(vcpu, opc, synci_inst);
 }
 
-int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+			struct kvm_vcpu *vcpu)
 {
-	u32 rt, rd, sel;
-	u32 mfc0_inst;
+	union mips_instruction mfc0_inst = { 0 };
+	u32 rd, sel;
 
-	rt = (inst >> 16) & 0x1f;
-	rd = (inst >> 11) & 0x1f;
-	sel = inst & 0x7;
+	rd = inst.c0r_format.rd;
+	sel = inst.c0r_format.sel;
 
-	if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
-		mfc0_inst = CLEAR_TEMPLATE;
-		mfc0_inst |= ((rt & 0x1f) << 11);
+	if (rd == MIPS_CP0_ERRCTL && sel == 0) {
+		mfc0_inst.r_format.opcode = spec_op;
+		mfc0_inst.r_format.rd = inst.c0r_format.rt;
+		mfc0_inst.r_format.func = add_op;
 	} else {
-		mfc0_inst = LW_TEMPLATE;
-		mfc0_inst |= ((rt & 0x1f) << 16);
-		mfc0_inst |= offsetof(struct kvm_mips_commpage,
-				      cop0.reg[rd][sel]);
+		mfc0_inst.i_format.opcode = lw_op;
+		mfc0_inst.i_format.rt = inst.c0r_format.rt;
+		mfc0_inst.i_format.simmediate =
+			offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
 	}
 
 	return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
 }
 
-int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+			struct kvm_vcpu *vcpu)
 {
-	u32 rt, rd, sel;
-	u32 mtc0_inst = SW_TEMPLATE;
+	union mips_instruction mtc0_inst = { 0 };
+	u32 rd, sel;
 
-	rt = (inst >> 16) & 0x1f;
-	rd = (inst >> 11) & 0x1f;
-	sel = inst & 0x7;
+	rd = inst.c0r_format.rd;
+	sel = inst.c0r_format.sel;
 
-	mtc0_inst |= ((rt & 0x1f) << 16);
-	mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+	mtc0_inst.i_format.opcode = sw_op;
+	mtc0_inst.i_format.rt = inst.c0r_format.rt;
+	mtc0_inst.i_format.simmediate =
+		offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
 
 	return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
 }
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index ff4072c2b25e36..80bb6212a06735 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -972,13 +972,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
 	return mask;
 }
 
-enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+					   u32 *opc, u32 cause,
 					   struct kvm_run *run,
 					   struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
-	u32 rt, rd, copz, sel, co_bit, op;
+	u32 rt, rd, sel;
 	unsigned long curr_pc;
 
 	/*
@@ -990,16 +991,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 	if (er == EMULATE_FAIL)
 		return er;
 
-	copz = (inst >> 21) & 0x1f;
-	rt = (inst >> 16) & 0x1f;
-	rd = (inst >> 11) & 0x1f;
-	sel = inst & 0x7;
-	co_bit = (inst >> 25) & 1;
-
-	if (co_bit) {
-		op = (inst) & 0xff;
-
-		switch (op) {
+	if (inst.co_format.co) {
+		switch (inst.co_format.func) {
 		case tlbr_op:	/*  Read indexed TLB entry  */
 			er = kvm_mips_emul_tlbr(vcpu);
 			break;
@@ -1018,13 +1011,16 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 		case eret_op:
 			er = kvm_mips_emul_eret(vcpu);
 			goto dont_update_pc;
-			break;
 		case wait_op:
 			er = kvm_mips_emul_wait(vcpu);
 			break;
 		}
 	} else {
-		switch (copz) {
+		rt = inst.c0r_format.rt;
+		rd = inst.c0r_format.rd;
+		sel = inst.c0r_format.sel;
+
+		switch (inst.c0r_format.rs) {
 		case mfc_op:
 #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
 			cop0->stat[rd][sel]++;
@@ -1258,7 +1254,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 				vcpu->arch.gprs[rt] =
 				    kvm_read_c0_guest_status(cop0);
 			/* EI */
-			if (inst & 0x20) {
+			if (inst.mfmc0_format.sc) {
 				kvm_debug("[%#lx] mfmc0_op: EI\n",
 					  vcpu->arch.pc);
 				kvm_set_c0_guest_status(cop0, ST0_IE);
@@ -1290,7 +1286,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 			break;
 		default:
 			kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n",
-				vcpu->arch.pc, copz);
+				vcpu->arch.pc, inst.c0r_format.rs);
 			er = EMULATE_FAIL;
 			break;
 		}
@@ -1311,13 +1307,13 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+					     u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DO_MMIO;
-	u32 op, base, rt;
-	s16 offset;
+	u32 rt;
 	u32 bytes;
 	void *data = run->mmio.data;
 	unsigned long curr_pc;
@@ -1331,12 +1327,9 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 	if (er == EMULATE_FAIL)
 		return er;
 
-	rt = (inst >> 16) & 0x1f;
-	base = (inst >> 21) & 0x1f;
-	offset = (s16)inst;
-	op = (inst >> 26) & 0x3f;
+	rt = inst.i_format.rt;
 
-	switch (op) {
+	switch (inst.i_format.opcode) {
 	case sb_op:
 		bytes = 1;
 		if (bytes > sizeof(run->mmio.data)) {
@@ -1413,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 
 	default:
 		kvm_err("Store not yet supported (inst=0x%08x)\n",
-			inst);
+			inst.word);
 		er = EMULATE_FAIL;
 		break;
 	}
@@ -1425,19 +1418,16 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
-					    struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+					    u32 cause, struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DO_MMIO;
-	u32 op, base, rt;
-	s16 offset;
+	u32 op, rt;
 	u32 bytes;
 
-	rt = (inst >> 16) & 0x1f;
-	base = (inst >> 21) & 0x1f;
-	offset = (s16)inst;
-	op = (inst >> 26) & 0x3f;
+	rt = inst.i_format.rt;
+	op = inst.i_format.opcode;
 
 	vcpu->arch.pending_load_cause = cause;
 	vcpu->arch.io_gpr = rt;
@@ -1524,7 +1514,7 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
 
 	default:
 		kvm_err("Load not yet supported (inst=0x%08x)\n",
-			inst);
+			inst.word);
 		er = EMULATE_FAIL;
 		break;
 	}
@@ -1532,8 +1522,8 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause,
 	return er;
 }
 
-enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
-					     u32 cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+					     u32 *opc, u32 cause,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
@@ -1554,9 +1544,9 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc,
 	if (er == EMULATE_FAIL)
 		return er;
 
-	base = (inst >> 21) & 0x1f;
-	op_inst = (inst >> 16) & 0x1f;
-	offset = (s16)inst;
+	base = inst.i_format.rs;
+	op_inst = inst.i_format.rt;
+	offset = inst.i_format.simmediate;
 	cache = op_inst & CacheOp_Cache;
 	op = op_inst & CacheOp_Op;
 
@@ -1693,16 +1683,16 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
+	union mips_instruction inst;
 	enum emulation_result er = EMULATE_DONE;
-	u32 inst;
 
 	/* Fetch the instruction. */
 	if (cause & CAUSEF_BD)
 		opc += 1;
 
-	inst = kvm_get_inst(opc, vcpu);
+	inst.word = kvm_get_inst(opc, vcpu);
 
-	switch (((union mips_instruction)inst).r_format.opcode) {
+	switch (inst.r_format.opcode) {
 	case cop0_op:
 		er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
 		break;
@@ -1727,7 +1717,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 
 	default:
 		kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,
-			inst);
+			inst.word);
 		kvm_arch_vcpu_dump_regs(vcpu);
 		er = EMULATE_FAIL;
 		break;
@@ -2262,21 +2252,6 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
 	return er;
 }
 
-/* ll/sc, rdhwr, sync emulation */
-
-#define OPCODE 0xfc000000
-#define BASE   0x03e00000
-#define RT     0x001f0000
-#define OFFSET 0x0000ffff
-#define LL     0xc0000000
-#define SC     0xe0000000
-#define SPEC0  0x00000000
-#define SPEC3  0x7c000000
-#define RD     0x0000f800
-#define FUNC   0x0000003f
-#define SYNC   0x0000000f
-#define RDHWR  0x0000003b
-
 enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 					 struct kvm_run *run,
 					 struct kvm_vcpu *vcpu)
@@ -2285,7 +2260,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 	struct kvm_vcpu_arch *arch = &vcpu->arch;
 	enum emulation_result er = EMULATE_DONE;
 	unsigned long curr_pc;
-	u32 inst;
+	union mips_instruction inst;
 
 	/*
 	 * Update PC and hold onto current PC in case there is
@@ -2300,18 +2275,19 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 	if (cause & CAUSEF_BD)
 		opc += 1;
 
-	inst = kvm_get_inst(opc, vcpu);
+	inst.word = kvm_get_inst(opc, vcpu);
 
-	if (inst == KVM_INVALID_INST) {
+	if (inst.word == KVM_INVALID_INST) {
 		kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
 		return EMULATE_FAIL;
 	}
 
-	if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+	if (inst.r_format.opcode == spec3_op &&
+	    inst.r_format.func == rdhwr_op) {
 		int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
-		int rd = (inst & RD) >> 11;
-		int rt = (inst & RT) >> 16;
-		int sel = (inst >> 6) & 0x7;
+		int rd = inst.r_format.rd;
+		int rt = inst.r_format.rt;
+		int sel = inst.r_format.re & 0x7;
 
 		/* If usermode, check RDHWR rd is allowed by guest HWREna */
 		if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
@@ -2352,7 +2328,8 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 		trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
 			      vcpu->arch.gprs[rt]);
 	} else {
-		kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+		kvm_debug("Emulate RI not supported @ %p: %#x\n",
+			  opc, inst.word);
 		goto emulate_ri;
 	}
 

From cc68d22f9727d02c1d981d27c11389fd9413e419 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:48 +0100
Subject: [PATCH 090/302] MIPS: KVM: Pass all unknown registers to callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pass all unrecognised register IDs through to the set_one_reg() and
get_one_reg() callbacks, not just select ones. This allows
implementation specific registers to be more easily added without having
to modify arch/mips/kvm/mips.c.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index b5ad2ba1847ab4..fe82f3354c2324 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -688,16 +688,11 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
 		v = (long)kvm_read_c0_guest_errorepc(cop0);
 		break;
 	/* registers to be handled specially */
-	case KVM_REG_MIPS_CP0_COUNT:
-	case KVM_REG_MIPS_COUNT_CTL:
-	case KVM_REG_MIPS_COUNT_RESUME:
-	case KVM_REG_MIPS_COUNT_HZ:
+	default:
 		ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
 		if (ret)
 			return ret;
 		break;
-	default:
-		return -EINVAL;
 	}
 	if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
 		u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -859,21 +854,8 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
 		kvm_write_c0_guest_errorepc(cop0, v);
 		break;
 	/* registers to be handled specially */
-	case KVM_REG_MIPS_CP0_COUNT:
-	case KVM_REG_MIPS_CP0_COMPARE:
-	case KVM_REG_MIPS_CP0_CAUSE:
-	case KVM_REG_MIPS_CP0_CONFIG:
-	case KVM_REG_MIPS_CP0_CONFIG1:
-	case KVM_REG_MIPS_CP0_CONFIG2:
-	case KVM_REG_MIPS_CP0_CONFIG3:
-	case KVM_REG_MIPS_CP0_CONFIG4:
-	case KVM_REG_MIPS_CP0_CONFIG5:
-	case KVM_REG_MIPS_COUNT_CTL:
-	case KVM_REG_MIPS_COUNT_RESUME:
-	case KVM_REG_MIPS_COUNT_HZ:
-		return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
 	default:
-		return -EINVAL;
+		return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
 	}
 	return 0;
 }

From f5c43bd4218c0d7bd65b010fd080cd6edeaeb4c8 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:49 +0100
Subject: [PATCH 091/302] MIPS: KVM: Make KVM_GET_REG_LIST dynamic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the implementation of KVM_GET_REG_LIST more dynamic so that only
the subset of registers actually available can be exposed to user mode.
This is important for VZ, where it may not be possible to prevent the
guest from accessing some of the guest register state, so the user
process may need to be aware of the state even if it doesn't understand
what the state is for.

This also allows different MIPS KVM implementations to provide different
registers to one another, by way of new num_regs(vcpu) and
copy_reg_indices(vcpu, indices) callback functions, currently just
stubbed for trap & emulate.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  2 ++
 arch/mips/kvm/mips.c             | 29 ++++++++++++++++++++++-------
 arch/mips/kvm/trap_emul.c        | 13 +++++++++++++
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 1e002136f514b6..38f0491fcb2ffd 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -560,6 +560,8 @@ struct kvm_mips_callbacks {
 			   u32 cause);
 	int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
 			 u32 cause);
+	unsigned long (*num_regs)(struct kvm_vcpu *vcpu);
+	int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices);
 	int (*get_one_reg)(struct kvm_vcpu *vcpu,
 			   const struct kvm_one_reg *reg, s64 *v);
 	int (*set_one_reg)(struct kvm_vcpu *vcpu,
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index fe82f3354c2324..2c4709a09b78d5 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -538,6 +538,26 @@ static u64 kvm_mips_get_one_regs[] = {
 	KVM_REG_MIPS_COUNT_HZ,
 };
 
+static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
+{
+	unsigned long ret;
+
+	ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+	ret += kvm_mips_callbacks->num_regs(vcpu);
+
+	return ret;
+}
+
+static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+	if (copy_to_user(indices, kvm_mips_get_one_regs,
+			 sizeof(kvm_mips_get_one_regs)))
+		return -EFAULT;
+	indices += ARRAY_SIZE(kvm_mips_get_one_regs);
+
+	return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
+}
+
 static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
 			    const struct kvm_one_reg *reg)
 {
@@ -908,23 +928,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
 	}
 	case KVM_GET_REG_LIST: {
 		struct kvm_reg_list __user *user_list = argp;
-		u64 __user *reg_dest;
 		struct kvm_reg_list reg_list;
 		unsigned n;
 
 		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
 			return -EFAULT;
 		n = reg_list.n;
-		reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs);
+		reg_list.n = kvm_mips_num_regs(vcpu);
 		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
 			return -EFAULT;
 		if (n < reg_list.n)
 			return -E2BIG;
-		reg_dest = user_list->reg;
-		if (copy_to_user(reg_dest, kvm_mips_get_one_regs,
-				 sizeof(kvm_mips_get_one_regs)))
-			return -EFAULT;
-		return 0;
+		return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
 	}
 	case KVM_NMI:
 		/* Treat the NMI as a CPU reset */
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 09b97fa9dabb7d..b64ca1a222f72b 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -478,6 +478,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
+					  u64 __user *indices)
+{
+	return 0;
+}
+
 static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
 				     const struct kvm_one_reg *reg,
 				     s64 *v)
@@ -627,6 +638,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
 	.dequeue_io_int = kvm_mips_dequeue_io_int_cb,
 	.irq_deliver = kvm_mips_irq_deliver_cb,
 	.irq_clear = kvm_mips_irq_clear_cb,
+	.num_regs = kvm_trap_emul_num_regs,
+	.copy_reg_indices = kvm_trap_emul_copy_reg_indices,
 	.get_one_reg = kvm_trap_emul_get_one_reg,
 	.set_one_reg = kvm_trap_emul_set_one_reg,
 	.vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,

From 19451e51012fa49070252b1b8157460d36618cee Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:50 +0100
Subject: [PATCH 092/302] MIPS: KVM: Use raw_cpu_has_fpu in
 kvm_mips_guest_can_have_fpu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to use kvm_mips_guest_can_have_fpu() when deciding which
registers to list with KVM_GET_REG_LIST. However, it causes warnings
with preemption since it uses cpu_has_fpu. KVM is only really supported
on CPUs which have symmetric FPUs, so switch to raw_cpu_has_fpu to avoid
the warning.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 38f0491fcb2ffd..f12eb01a319591 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -510,7 +510,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
 {
-	return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+	return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) &&
 		vcpu->fpu_enabled;
 }
 

From e57759306c44ba6105c04eafc3b22efc55bb7ad2 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:51 +0100
Subject: [PATCH 093/302] MIPS: KVM: List FPU/MSA registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make KVM_GET_REG_LIST list FPU & MSA registers. Specifically we list all
32 vector registers when MSA can be enabled, 32 single-precision FP
registers when FPU can be enabled, and either 16 or 32 double-precision
FP registers when FPU can be enabled depending on whether FR mode is
supported (which provides 32 doubles instead of 16 even doubles).

Note, these registers may still be inaccessible depending on the current
FP mode of the guest.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c | 58 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 2c4709a09b78d5..622b9feba9273b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -538,11 +538,29 @@ static u64 kvm_mips_get_one_regs[] = {
 	KVM_REG_MIPS_COUNT_HZ,
 };
 
+static u64 kvm_mips_get_one_regs_fpu[] = {
+	KVM_REG_MIPS_FCR_IR,
+	KVM_REG_MIPS_FCR_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_msa[] = {
+	KVM_REG_MIPS_MSA_IR,
+	KVM_REG_MIPS_MSA_CSR,
+};
+
 static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 {
 	unsigned long ret;
 
 	ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+	if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+		ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48;
+		/* odd doubles */
+		if (boot_cpu_data.fpu_id & MIPS_FPIR_F64)
+			ret += 16;
+	}
+	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+		ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
 	ret += kvm_mips_callbacks->num_regs(vcpu);
 
 	return ret;
@@ -550,11 +568,51 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 
 static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 {
+	u64 index;
+	unsigned int i;
+
 	if (copy_to_user(indices, kvm_mips_get_one_regs,
 			 sizeof(kvm_mips_get_one_regs)))
 		return -EFAULT;
 	indices += ARRAY_SIZE(kvm_mips_get_one_regs);
 
+	if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+		if (copy_to_user(indices, kvm_mips_get_one_regs_fpu,
+				 sizeof(kvm_mips_get_one_regs_fpu)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu);
+
+		for (i = 0; i < 32; ++i) {
+			index = KVM_REG_MIPS_FPR_32(i);
+			if (copy_to_user(indices, &index, sizeof(index)))
+				return -EFAULT;
+			++indices;
+
+			/* skip odd doubles if no F64 */
+			if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+				continue;
+
+			index = KVM_REG_MIPS_FPR_64(i);
+			if (copy_to_user(indices, &index, sizeof(index)))
+				return -EFAULT;
+			++indices;
+		}
+	}
+
+	if (kvm_mips_guest_can_have_msa(&vcpu->arch)) {
+		if (copy_to_user(indices, kvm_mips_get_one_regs_msa,
+				 sizeof(kvm_mips_get_one_regs_msa)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa);
+
+		for (i = 0; i < 32; ++i) {
+			index = KVM_REG_MIPS_VEC_128(i);
+			if (copy_to_user(indices, &index, sizeof(index)))
+				return -EFAULT;
+			++indices;
+		}
+	}
+
 	return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
 }
 

From aff565aab961d3cab3299a7008af6cdef88b79a0 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:52 +0100
Subject: [PATCH 094/302] MIPS: Clean up RDHWR handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No preprocessor definitions are used in the handling of the registers
accessible with the RDHWR instruction, nor the corresponding bits in the
CP0 HWREna register.

Add definitions for both the register numbers (MIPS_HWR_*) and HWREna
bits (MIPS_HWRENA_*) in asm/mipsregs.h and make use of them in the
initialisation of HWREna and emulation of the RDHWR instruction.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: David Daney <david.daney@cavium.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../cpu-feature-overrides.h                   |  2 +-
 arch/mips/include/asm/mipsregs.h              | 20 ++++++++++++++++++-
 arch/mips/kernel/traps.c                      | 17 +++++++++-------
 arch/mips/kvm/emulate.c                       | 10 +++++-----
 4 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
index d68e685cde6032..bd8b9bbe17719d 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
@@ -55,7 +55,7 @@
 #define cpu_has_mipsmt		0
 #define cpu_has_vint		0
 #define cpu_has_veic		0
-#define cpu_hwrena_impl_bits	0xc0000000
+#define cpu_hwrena_impl_bits	(MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2)
 #define cpu_has_wsbh            1
 
 #define cpu_has_rixi		(cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index e1ca65c62f6a54..8b1b37d50d158f 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -53,7 +53,7 @@
 #define CP0_SEGCTL2 $5, 4
 #define CP0_WIRED $6
 #define CP0_INFO $7
-#define CP0_HWRENA $7, 0
+#define CP0_HWRENA $7
 #define CP0_BADVADDR $8
 #define CP0_BADINSTR $8, 1
 #define CP0_COUNT $9
@@ -853,6 +853,24 @@
 #define MIPS_CDMMBASE_ADDR_SHIFT 11
 #define MIPS_CDMMBASE_ADDR_START 15
 
+/* RDHWR register numbers */
+#define MIPS_HWR_CPUNUM		0	/* CPU number */
+#define MIPS_HWR_SYNCISTEP	1	/* SYNCI step size */
+#define MIPS_HWR_CC		2	/* Cycle counter */
+#define MIPS_HWR_CCRES		3	/* Cycle counter resolution */
+#define MIPS_HWR_ULR		29	/* UserLocal */
+#define MIPS_HWR_IMPL1		30	/* Implementation dependent */
+#define MIPS_HWR_IMPL2		31	/* Implementation dependent */
+
+/* Bits in HWREna register */
+#define MIPS_HWRENA_CPUNUM	(_ULCAST_(1) << MIPS_HWR_CPUNUM)
+#define MIPS_HWRENA_SYNCISTEP	(_ULCAST_(1) << MIPS_HWR_SYNCISTEP)
+#define MIPS_HWRENA_CC		(_ULCAST_(1) << MIPS_HWR_CC)
+#define MIPS_HWRENA_CCRES	(_ULCAST_(1) << MIPS_HWR_CCRES)
+#define MIPS_HWRENA_ULR		(_ULCAST_(1) << MIPS_HWR_ULR)
+#define MIPS_HWRENA_IMPL1	(_ULCAST_(1) << MIPS_HWR_IMPL1)
+#define MIPS_HWRENA_IMPL2	(_ULCAST_(1) << MIPS_HWR_IMPL2)
+
 /*
  * Bitfields in the TX39 family CP0 Configuration Register 3
  */
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 66e5820bfdae43..7176a6057e26d6 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
 	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
 			1, regs, 0);
 	switch (rd) {
-	case 0:		/* CPU number */
+	case MIPS_HWR_CPUNUM:		/* CPU number */
 		regs->regs[rt] = smp_processor_id();
 		return 0;
-	case 1:		/* SYNCI length */
+	case MIPS_HWR_SYNCISTEP:	/* SYNCI length */
 		regs->regs[rt] = min(current_cpu_data.dcache.linesz,
 				     current_cpu_data.icache.linesz);
 		return 0;
-	case 2:		/* Read count register */
+	case MIPS_HWR_CC:		/* Read count register */
 		regs->regs[rt] = read_c0_count();
 		return 0;
-	case 3:		/* Count register resolution */
+	case MIPS_HWR_CCRES:		/* Count register resolution */
 		switch (current_cpu_type()) {
 		case CPU_20KC:
 		case CPU_25KF:
@@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
 			regs->regs[rt] = 2;
 		}
 		return 0;
-	case 29:
+	case MIPS_HWR_ULR:		/* Read UserLocal register */
 		regs->regs[rt] = ti->tp_value;
 		return 0;
 	default:
@@ -2070,10 +2070,13 @@ static void configure_hwrena(void)
 	unsigned int hwrena = cpu_hwrena_impl_bits;
 
 	if (cpu_has_mips_r2_r6)
-		hwrena |= 0x0000000f;
+		hwrena |= MIPS_HWRENA_CPUNUM |
+			  MIPS_HWRENA_SYNCISTEP |
+			  MIPS_HWRENA_CC |
+			  MIPS_HWRENA_CCRES;
 
 	if (!noulri && cpu_has_userlocal)
-		hwrena |= (1 << 29);
+		hwrena |= MIPS_HWRENA_ULR;
 
 	if (hwrena)
 		write_c0_hwrena(hwrena);
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 80bb6212a06735..892f36f56d32e6 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -2296,17 +2296,17 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 			goto emulate_ri;
 		}
 		switch (rd) {
-		case 0:	/* CPU number */
+		case MIPS_HWR_CPUNUM:		/* CPU number */
 			arch->gprs[rt] = 0;
 			break;
-		case 1:	/* SYNCI length */
+		case MIPS_HWR_SYNCISTEP:	/* SYNCI length */
 			arch->gprs[rt] = min(current_cpu_data.dcache.linesz,
 					     current_cpu_data.icache.linesz);
 			break;
-		case 2:	/* Read count register */
+		case MIPS_HWR_CC:		/* Read count register */
 			arch->gprs[rt] = kvm_mips_read_count(vcpu);
 			break;
-		case 3:	/* Count register resolution */
+		case MIPS_HWR_CCRES:		/* Count register resolution */
 			switch (current_cpu_data.cputype) {
 			case CPU_20KC:
 			case CPU_25KF:
@@ -2316,7 +2316,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 				arch->gprs[rt] = 2;
 			}
 			break;
-		case 29:
+		case MIPS_HWR_ULR:		/* Read UserLocal register */
 			arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
 			break;
 

From b937ff628fa76b242a74cb9087df972d5f1cecbb Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:53 +0100
Subject: [PATCH 095/302] MIPS: KVM: Don't hardcode restored HWREna
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM modifies CP0_HWREna during guest execution so it can trap and
emulate RDHWR instructions, however it always restores the hardcoded
value 0x2000000F. This assumes the presence of the UserLocal register,
and the absence of any implementation dependent or future HW registers.

Fix by exporting the value that traps.c writes into CP0_HWREna, and
loading from there instead of hard coding.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/setup.h | 1 +
 arch/mips/kernel/traps.c      | 5 ++++-
 arch/mips/kvm/locore.S        | 4 ++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h
index d7bfdeba9e845a..4f5279a8308d7c 100644
--- a/arch/mips/include/asm/setup.h
+++ b/arch/mips/include/asm/setup.h
@@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
 
 extern void *set_except_vector(int n, void *addr);
 extern unsigned long ebase;
+extern unsigned int hwrena;
 extern void per_cpu_trap_init(bool);
 extern void cpu_cache_init(void);
 
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 7176a6057e26d6..6fb4704bd156ad 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -2064,10 +2064,13 @@ static void configure_status(void)
 			 status_set);
 }
 
+unsigned int hwrena;
+EXPORT_SYMBOL_GPL(hwrena);
+
 /* configure HWRENA register */
 static void configure_hwrena(void)
 {
-	unsigned int hwrena = cpu_hwrena_impl_bits;
+	hwrena = cpu_hwrena_impl_bits;
 
 	if (cpu_has_mips_r2_r6)
 		hwrena |= MIPS_HWRENA_CPUNUM |
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
index f87bec546366f2..698286c0f7323f 100644
--- a/arch/mips/kvm/locore.S
+++ b/arch/mips/kvm/locore.S
@@ -381,7 +381,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
 	mtc0	k0, CP0_DDATA_LO
 
 	/* Restore RDHWR access */
-	PTR_LI	k0, 0x2000000F
+	INT_L	k0, hwrena
 	mtc0	k0, CP0_HWRENA
 
 	/* Jump to handler */
@@ -553,7 +553,7 @@ __kvm_mips_return_to_host:
 	mtlo	k0
 
 	/* Restore RDHWR access */
-	PTR_LI	k0, 0x2000000F
+	INT_L	k0, hwrena
 	mtc0	k0, CP0_HWRENA
 
 	/* Restore RA, which is the address we will return to */

From cef061d086b1de75445cba63af5306f98fb52f4b Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:54 +0100
Subject: [PATCH 096/302] MIPS: KVM: Allow ULRI to restrict UserLocal register
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ULRI bit in Config3 specifies whether the UserLocal register is
implemented, but it is assumed to always be set. Now that the Config
registers can be modified by userland, allow Config3.ULRI to be cleared
and check ULRI before allowing the corresponding bit to be set in
HWREna.

In fact any HWREna bits corresponding to unimplemented RDHWR registers
should read as zero and be ignored on write, so we actually prevent
other unimplemented bits being set too.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 892f36f56d32e6..84f435bf74bd38 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -921,8 +921,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
  */
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 {
-	/* Config4 is optional */
-	unsigned int mask = MIPS_CONF_M;
+	/* Config4 and ULRI are optional */
+	unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI;
 
 	/* Permit MSA to be present if MSA is supported */
 	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -1229,6 +1229,16 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 					else
 						kvm_mips_count_enable_cause(vcpu);
 				}
+			} else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) {
+				u32 mask = MIPS_HWRENA_CPUNUM |
+					   MIPS_HWRENA_SYNCISTEP |
+					   MIPS_HWRENA_CC |
+					   MIPS_HWRENA_CCRES;
+
+				if (kvm_read_c0_guest_config3(cop0) &
+				    MIPS_CONF3_ULRI)
+					mask |= MIPS_HWRENA_ULR;
+				cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask;
 			} else {
 				cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS

From cf1fb0f29d83bcc1d8af912e56f2295397372908 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:55 +0100
Subject: [PATCH 097/302] MIPS: KVM: Emulate RDHWR CPUNum register
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Actually provide the VCPU number when emulating the RDHWR CPUNum
register, so that it will match the CPUNum field of CP0_EBase register,
rather than always returning 0.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 84f435bf74bd38..4ca5450febbb8c 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -2307,7 +2307,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 		}
 		switch (rd) {
 		case MIPS_HWR_CPUNUM:		/* CPU number */
-			arch->gprs[rt] = 0;
+			arch->gprs[rt] = vcpu->vcpu_id;
 			break;
 		case MIPS_HWR_SYNCISTEP:	/* SYNCI length */
 			arch->gprs[rt] = min(current_cpu_data.dcache.linesz,

From 05108709526716e1d40210fe3b9d7acd1cb694ea Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:56 +0100
Subject: [PATCH 098/302] MIPS: KVM: Add KScratch registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow up to 6 KVM guest KScratch registers to be enabled and accessed
via the KVM guest register API and from the guest itself (the fallback
reading and writing of commpage registers is sufficient for KScratch
registers to work as expected).

User mode can expose the registers by setting the appropriate bits of
the guest Config4.KScrExist field. KScratch registers that aren't usable
won't be writeable via the KVM Ioctl API.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  6 +++
 arch/mips/include/asm/kvm_host.h  | 19 +++++++++
 arch/mips/kvm/emulate.c           |  7 ++-
 arch/mips/kvm/mips.c              | 71 +++++++++++++++++++++++++++++++
 arch/mips/kvm/trace.h             |  6 +++
 arch/mips/kvm/trap_emul.c         |  2 +
 6 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4aac3e51bf9f67..09efa9eb3926d5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2032,6 +2032,12 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH1    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH2    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH3    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH4    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH5    | 64
+  MIPS  | KVM_REG_MIPS_CP0_KSCRATCH6    | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index f12eb01a319591..5e9da2a31fde02 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -56,6 +56,12 @@
 #define KVM_REG_MIPS_CP0_CONFIG7	MIPS_CP0_32(16, 7)
 #define KVM_REG_MIPS_CP0_XCONTEXT	MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC	MIPS_CP0_64(30, 0)
+#define KVM_REG_MIPS_CP0_KSCRATCH1	MIPS_CP0_64(31, 2)
+#define KVM_REG_MIPS_CP0_KSCRATCH2	MIPS_CP0_64(31, 3)
+#define KVM_REG_MIPS_CP0_KSCRATCH3	MIPS_CP0_64(31, 4)
+#define KVM_REG_MIPS_CP0_KSCRATCH4	MIPS_CP0_64(31, 5)
+#define KVM_REG_MIPS_CP0_KSCRATCH5	MIPS_CP0_64(31, 6)
+#define KVM_REG_MIPS_CP0_KSCRATCH6	MIPS_CP0_64(31, 7)
 
 
 #define KVM_MAX_VCPUS		1
@@ -376,6 +382,7 @@ struct kvm_vcpu_arch {
 
 	u8 fpu_enabled;
 	u8 msa_enabled;
+	u8 kscratch_enabled;
 };
 
 
@@ -429,6 +436,18 @@ struct kvm_vcpu_arch {
 #define kvm_write_c0_guest_config7(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][7] = (val))
 #define kvm_read_c0_guest_errorepc(cop0)	(cop0->reg[MIPS_CP0_ERROR_PC][0])
 #define kvm_write_c0_guest_errorepc(cop0, val)	(cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+#define kvm_read_c0_guest_kscratch1(cop0)	(cop0->reg[MIPS_CP0_DESAVE][2])
+#define kvm_read_c0_guest_kscratch2(cop0)	(cop0->reg[MIPS_CP0_DESAVE][3])
+#define kvm_read_c0_guest_kscratch3(cop0)	(cop0->reg[MIPS_CP0_DESAVE][4])
+#define kvm_read_c0_guest_kscratch4(cop0)	(cop0->reg[MIPS_CP0_DESAVE][5])
+#define kvm_read_c0_guest_kscratch5(cop0)	(cop0->reg[MIPS_CP0_DESAVE][6])
+#define kvm_read_c0_guest_kscratch6(cop0)	(cop0->reg[MIPS_CP0_DESAVE][7])
+#define kvm_write_c0_guest_kscratch1(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][2] = (val))
+#define kvm_write_c0_guest_kscratch2(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][3] = (val))
+#define kvm_write_c0_guest_kscratch3(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][4] = (val))
+#define kvm_write_c0_guest_kscratch4(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][5] = (val))
+#define kvm_write_c0_guest_kscratch5(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][6] = (val))
+#define kvm_write_c0_guest_kscratch6(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][7] = (val))
 
 /*
  * Some of the guest registers may be modified asynchronously (e.g. from a
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 4ca5450febbb8c..5f0354c80c8eb9 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -941,7 +941,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
 {
 	/* Config5 is optional */
-	return MIPS_CONF_M;
+	unsigned int mask = MIPS_CONF_M;
+
+	/* KScrExist */
+	mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+
+	return mask;
 }
 
 /**
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 622b9feba9273b..5a2b9034a05ce1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -9,6 +9,7 @@
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
  */
 
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kdebug.h>
@@ -548,6 +549,15 @@ static u64 kvm_mips_get_one_regs_msa[] = {
 	KVM_REG_MIPS_MSA_CSR,
 };
 
+static u64 kvm_mips_get_one_regs_kscratch[] = {
+	KVM_REG_MIPS_CP0_KSCRATCH1,
+	KVM_REG_MIPS_CP0_KSCRATCH2,
+	KVM_REG_MIPS_CP0_KSCRATCH3,
+	KVM_REG_MIPS_CP0_KSCRATCH4,
+	KVM_REG_MIPS_CP0_KSCRATCH5,
+	KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
 static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 {
 	unsigned long ret;
@@ -561,6 +571,7 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
 	}
 	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
 		ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
+	ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
 	ret += kvm_mips_callbacks->num_regs(vcpu);
 
 	return ret;
@@ -613,6 +624,16 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 		}
 	}
 
+	for (i = 0; i < 6; ++i) {
+		if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
+			continue;
+
+		if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
+				 sizeof(kvm_mips_get_one_regs_kscratch[i])))
+			return -EFAULT;
+		++indices;
+	}
+
 	return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
 }
 
@@ -765,6 +786,31 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_ERROREPC:
 		v = (long)kvm_read_c0_guest_errorepc(cop0);
 		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			v = (long)kvm_read_c0_guest_kscratch1(cop0);
+			break;
+		case 3:
+			v = (long)kvm_read_c0_guest_kscratch2(cop0);
+			break;
+		case 4:
+			v = (long)kvm_read_c0_guest_kscratch3(cop0);
+			break;
+		case 5:
+			v = (long)kvm_read_c0_guest_kscratch4(cop0);
+			break;
+		case 6:
+			v = (long)kvm_read_c0_guest_kscratch5(cop0);
+			break;
+		case 7:
+			v = (long)kvm_read_c0_guest_kscratch6(cop0);
+			break;
+		}
+		break;
 	/* registers to be handled specially */
 	default:
 		ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
@@ -931,6 +977,31 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_ERROREPC:
 		kvm_write_c0_guest_errorepc(cop0, v);
 		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			kvm_write_c0_guest_kscratch1(cop0, v);
+			break;
+		case 3:
+			kvm_write_c0_guest_kscratch2(cop0, v);
+			break;
+		case 4:
+			kvm_write_c0_guest_kscratch3(cop0, v);
+			break;
+		case 5:
+			kvm_write_c0_guest_kscratch4(cop0, v);
+			break;
+		case 6:
+			kvm_write_c0_guest_kscratch5(cop0, v);
+			break;
+		case 7:
+			kvm_write_c0_guest_kscratch6(cop0, v);
+			break;
+		}
+		break;
 	/* registers to be handled specially */
 	default:
 		return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index 5d712ecb07344b..a38bdab685745e 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -178,6 +178,12 @@ TRACE_EVENT(kvm_exit,
 	{ KVM_TRACE_COP0(16, 7),	"Config7" },		\
 	{ KVM_TRACE_COP0(26, 0),	"ECC" },		\
 	{ KVM_TRACE_COP0(30, 0),	"ErrorEPC" },		\
+	{ KVM_TRACE_COP0(31, 2),	"KScratch1" },		\
+	{ KVM_TRACE_COP0(31, 3),	"KScratch2" },		\
+	{ KVM_TRACE_COP0(31, 4),	"KScratch3" },		\
+	{ KVM_TRACE_COP0(31, 5),	"KScratch4" },		\
+	{ KVM_TRACE_COP0(31, 6),	"KScratch5" },		\
+	{ KVM_TRACE_COP0(31, 7),	"KScratch6" },		\
 	{ KVM_TRACE_HWR( 0, 0),		"CPUNum" },		\
 	{ KVM_TRACE_HWR( 1, 0),		"SYNCI_Step" },		\
 	{ KVM_TRACE_HWR( 2, 0),		"CC" },			\
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index b64ca1a222f72b..eb191c4612bb46 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -418,6 +418,8 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm)
 
 static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.kscratch_enabled = 0xfc;
+
 	return 0;
 }
 

From 42aa12e74e91f790d239bfb852260d07573ce83f Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:57 +0100
Subject: [PATCH 099/302] MIPS: KVM: Move commpage so 0x0 is unmapped
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The comm page which is mapped into the guest kernel address space at
0x0 has the unfortunate side effect of allowing guest kernel NULL
pointer dereferences to succeed. The only constraint on this address is
that it must be within 32KiB of 0x0, so that single lw/sw instructions
(which have 16-bit signed offset fields) can be used to access it, using
the zero register as a base.

So let's move the comm page as high as possible within that constraint so
that 0x0 can be left unmapped, at least for page sizes < 32KiB.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 10 ++++++++--
 arch/mips/kvm/commpage.c         |  2 +-
 arch/mips/kvm/dyntrans.c         |  4 ++--
 arch/mips/kvm/tlb.c              | 18 +++++++++---------
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 5e9da2a31fde02..6c43c782bdfad1 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -74,8 +74,14 @@
 
 
 
-/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR		0x0
+/*
+ * Special address that contains the comm page, used for reducing # of traps
+ * This needs to be within 32Kb of 0x0 (so the zero register can be used), but
+ * preferably not at 0x0 so that most kernel NULL pointer dereferences can be
+ * caught.
+ */
+#define KVM_GUEST_COMMPAGE_ADDR		((PAGE_SIZE > 0x8000) ?	0 : \
+					 (0x8000 - PAGE_SIZE))
 
 #define KVM_GUEST_KERNEL_MODE(vcpu)	((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
 					((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c
index 2d6e976d1add98..a36b77e1705c58 100644
--- a/arch/mips/kvm/commpage.c
+++ b/arch/mips/kvm/commpage.c
@@ -4,7 +4,7 @@
  * for more details.
  *
  * commpage, currently used for Virtual COP0 registers.
- * Mapped into the guest kernel @ 0x0.
+ * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR.
  *
  * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index a3031dae8d1bb5..8a1833b9eb384d 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -93,7 +93,7 @@ int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
 	} else {
 		mfc0_inst.i_format.opcode = lw_op;
 		mfc0_inst.i_format.rt = inst.c0r_format.rt;
-		mfc0_inst.i_format.simmediate =
+		mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
 			offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
 	}
 
@@ -111,7 +111,7 @@ int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
 
 	mtc0_inst.i_format.opcode = sw_op;
 	mtc0_inst.i_format.rt = inst.c0r_format.rt;
-	mtc0_inst.i_format.simmediate =
+	mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
 		offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
 
 	return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 8012e686d4ae51..385fbd34e77dee 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -171,23 +171,23 @@ EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	struct kvm_vcpu *vcpu)
 {
-	kvm_pfn_t pfn0, pfn1;
+	kvm_pfn_t pfn;
 	unsigned long flags, old_entryhi = 0, vaddr = 0;
-	unsigned long entrylo0 = 0, entrylo1 = 0;
+	unsigned long entrylo[2] = { 0, 0 };
+	unsigned int pair_idx;
 
-	pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
-	pfn1 = 0;
-	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
-	entrylo1 = 0;
+	pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
+	pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
+	entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
+		(0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
 
 	local_irq_save(flags);
 
 	old_entryhi = read_c0_entryhi();
 	vaddr = badvaddr & (PAGE_MASK << 1);
 	write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
-	write_c0_entrylo0(entrylo0);
-	write_c0_entrylo1(entrylo1);
+	write_c0_entrylo0(entrylo[0]);
+	write_c0_entrylo1(entrylo[1]);
 	write_c0_index(kvm_mips_get_commpage_asid(vcpu));
 	mtc0_tlbw_hazard();
 	tlb_write_indexed();

From 7414d2f65006ac8609196092f2869e0942599b72 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:58 +0100
Subject: [PATCH 100/302] MIPS: KVM: Use host CCA for TLB mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM TLB mappings for the guest were being created with a cache coherency
attribute (CCA) of 3, which is cached incoherent. Create them instead
with the default host CCA, which should be the correct one for coherency
on SMP systems.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mmu.c | 18 ++++++++++--------
 arch/mips/kvm/tlb.c |  3 ++-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 2f494ec5c939c0..ecead748de049f 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -116,9 +116,11 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 	pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
 
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
+		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+		ENTRYLO_D | ENTRYLO_V;
 	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
-		   (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
+		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+		ENTRYLO_D | ENTRYLO_V;
 
 	preempt_disable();
 	entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
@@ -157,13 +159,13 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 
 	/* Get attributes from the Guest TLB */
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-		   (0x3 << ENTRYLO_C_SHIFT) |
-		   (tlb->tlb_lo[0] & ENTRYLO_D) |
-		   (tlb->tlb_lo[0] & ENTRYLO_V);
+		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+		(tlb->tlb_lo[0] & ENTRYLO_D) |
+		(tlb->tlb_lo[0] & ENTRYLO_V);
 	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
-		   (0x3 << ENTRYLO_C_SHIFT) |
-		   (tlb->tlb_lo[1] & ENTRYLO_D) |
-		   (tlb->tlb_lo[1] & ENTRYLO_V);
+		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+		(tlb->tlb_lo[1] & ENTRYLO_D) |
+		(tlb->tlb_lo[1] & ENTRYLO_V);
 
 	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
 		  tlb->tlb_lo[0], tlb->tlb_lo[1]);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 385fbd34e77dee..9699352293e498 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -179,7 +179,8 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
 	pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
 	entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
-		(0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V;
+		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+		ENTRYLO_D | ENTRYLO_V;
 
 	local_irq_save(flags);
 

From 4b34bca0e4c7091a06d774342faf8c9a4836af22 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:29:59 +0100
Subject: [PATCH 101/302] MIPS: Add define for Config.VI (virtual icache) bit

The Config.VI bit specifies that the instruction cache is virtually
tagged, which is checked in c-r4k.c's probe_pcache(). Add a proper
definition for it in mipsregs.h and make use of it.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/mipsregs.h | 1 +
 arch/mips/mm/c-r4k.c             | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index 8b1b37d50d158f..def9d8d13f6ecb 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -533,6 +533,7 @@
 #define TX49_CONF_CWFON		(_ULCAST_(1) << 27)
 
 /* Bits specific to the MIPS32/64 PRA.	*/
+#define MIPS_CONF_VI		(_ULCAST_(1) <<  3)
 #define MIPS_CONF_MT		(_ULCAST_(7) <<	 7)
 #define MIPS_CONF_MT_TLB	(_ULCAST_(1) <<  7)
 #define MIPS_CONF_MT_FTLB	(_ULCAST_(4) <<  7)
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index ef7f925dd1b028..7a9c345e87e5d1 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -1206,7 +1206,7 @@ static void probe_pcache(void)
 			      c->icache.linesz;
 		c->icache.waybit = __ffs(icache_size/c->icache.ways);
 
-		if (config & 0x8)		/* VI bit */
+		if (config & MIPS_CONF_VI)
 			c->icache.flags |= MIPS_CACHE_VTAG;
 
 		/*

From e342925f1777f73befda61b48845b0bc88a33181 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:30:00 +0100
Subject: [PATCH 102/302] MIPS: KVM: Report more accurate CP0_Config fields to
 guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initialise the guest's CP0_Config register with a few more bits of
information from the host. The BE bit should be set on big endian
machines, the VI bit should be set on machines with a virtually tagged
instruction cache, and the reported architecture revision should match
that of the host (since we won't support emulating pre-r6 instruction
encodings on r6 or vice versa).

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/trap_emul.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index eb191c4612bb46..1dc003ddca913f 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -426,7 +426,7 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	u32 config1;
+	u32 config, config1;
 	int vcpu_id = vcpu->vcpu_id;
 
 	/*
@@ -434,10 +434,20 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	 * guest will come up as expected, for now we simulate a MIPS 24kc
 	 */
 	kvm_write_c0_guest_prid(cop0, 0x00019300);
-	/* Have config1, Cacheable, noncoherent, write-back, write allocate */
-	kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
-				  (0x1 << CP0C0_AR) |
-				  (MMU_TYPE_R4000 << CP0C0_MT));
+	/*
+	 * Have config1, Cacheable, noncoherent, write-back, write allocate.
+	 * Endianness, arch revision & virtually tagged icache should match
+	 * host.
+	 */
+	config = read_c0_config() & MIPS_CONF_AR;
+	config |= MIPS_CONF_M | (0x3 << CP0C0_K0) |
+		(MMU_TYPE_R4000 << CP0C0_MT);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	config |= CONF_BE;
+#endif
+	if (cpu_has_vtag_icache)
+		config |= MIPS_CONF_VI;
+	kvm_write_c0_guest_config(cop0, config);
 
 	/* Read the cache characteristics from the host Config1 Register */
 	config1 = (read_c0_config1() & ~0x7f);

From 4e10b764e2cba8d8eb5e22d9d8061806ec86805c Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Wed, 15 Jun 2016 19:30:01 +0100
Subject: [PATCH 103/302] MIPS: KVM: Use mipsregs.h defs for config registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert MIPS KVM guest register state initialisation to use the standard
<asm/mipsregs.h> register field definitions for Config registers, and
drop the custom definitions in kvm_host.h which it was using before.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 67 --------------------------------
 arch/mips/kvm/trap_emul.c        |  8 ++--
 2 files changed, 3 insertions(+), 72 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6c43c782bdfad1..b0773c6d622fa5 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -205,73 +205,6 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL	4
 #define MIPS_CP0_CONFIG5_SEL	5
 
-/* Config0 register bits */
-#define CP0C0_M			31
-#define CP0C0_K23		28
-#define CP0C0_KU		25
-#define CP0C0_MDU		20
-#define CP0C0_MM		17
-#define CP0C0_BM		16
-#define CP0C0_BE		15
-#define CP0C0_AT		13
-#define CP0C0_AR		10
-#define CP0C0_MT		7
-#define CP0C0_VI		3
-#define CP0C0_K0		0
-
-/* Config1 register bits */
-#define CP0C1_M			31
-#define CP0C1_MMU		25
-#define CP0C1_IS		22
-#define CP0C1_IL		19
-#define CP0C1_IA		16
-#define CP0C1_DS		13
-#define CP0C1_DL		10
-#define CP0C1_DA		7
-#define CP0C1_C2		6
-#define CP0C1_MD		5
-#define CP0C1_PC		4
-#define CP0C1_WR		3
-#define CP0C1_CA		2
-#define CP0C1_EP		1
-#define CP0C1_FP		0
-
-/* Config2 Register bits */
-#define CP0C2_M			31
-#define CP0C2_TU		28
-#define CP0C2_TS		24
-#define CP0C2_TL		20
-#define CP0C2_TA		16
-#define CP0C2_SU		12
-#define CP0C2_SS		8
-#define CP0C2_SL		4
-#define CP0C2_SA		0
-
-/* Config3 Register bits */
-#define CP0C3_M			31
-#define CP0C3_ISA_ON_EXC	16
-#define CP0C3_ULRI		13
-#define CP0C3_DSPP		10
-#define CP0C3_LPA		7
-#define CP0C3_VEIC		6
-#define CP0C3_VInt		5
-#define CP0C3_SP		4
-#define CP0C3_MT		2
-#define CP0C3_SM		1
-#define CP0C3_TL		0
-
-/* MMU types, the first four entries have the same layout as the
-   CP0C0_MT field.  */
-enum mips_mmu_types {
-	MMU_TYPE_NONE,
-	MMU_TYPE_R4000,
-	MMU_TYPE_RESERVED,
-	MMU_TYPE_FMT,
-	MMU_TYPE_R3000,
-	MMU_TYPE_R6000,
-	MMU_TYPE_R8000
-};
-
 /* Resume Flags */
 #define RESUME_FLAG_DR		(1<<0)	/* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST	(1<<1)	/* Resume host? */
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 1dc003ddca913f..00e8dc3d36cb1c 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -440,8 +440,7 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	 * host.
 	 */
 	config = read_c0_config() & MIPS_CONF_AR;
-	config |= MIPS_CONF_M | (0x3 << CP0C0_K0) |
-		(MMU_TYPE_R4000 << CP0C0_MT);
+	config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB;
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	config |= CONF_BE;
 #endif
@@ -457,9 +456,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
 
 	/* We unset some bits that we aren't emulating */
-	config1 &=
-	    ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) |
-	      (1 << CP0C1_WR) | (1 << CP0C1_CA));
+	config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC |
+		     MIPS_CONF1_WR | MIPS_CONF1_CA);
 	kvm_write_c0_guest_config1(cop0, config1);
 
 	/* Have config3, no tertiary/secondary caches implemented */

From 682a8108872f78560c891cf30c7d08aa01dac943 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 9 May 2016 11:53:06 +0200
Subject: [PATCH 104/302] x86/kvm/svm: Simplify cpu_has_svm()

Use already cached CPUID information instead of querying CPUID again.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: kvm@vger.kernel.org
Cc: x86@kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/svm.h     | 1 -
 arch/x86/include/asm/virtext.h | 8 ++------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index d0fe23ec7e9877..14824fc78f7e71 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb {
 	struct vmcb_save_area save;
 };
 
-#define SVM_CPUID_FEATURE_SHIFT 2
 #define SVM_CPUID_FUNC 0x8000000a
 
 #define SVM_VM_CR_SVM_DISABLE 4
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index cce9ee68e335f8..0116b2ee9e64f3 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void)
  */
 static inline int cpu_has_svm(const char **msg)
 {
-	uint32_t eax, ebx, ecx, edx;
-
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
 		if (msg)
 			*msg = "not amd";
 		return 0;
 	}
 
-	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-	if (eax < SVM_CPUID_FUNC) {
+	if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
 		if (msg)
 			*msg = "can't execute cpuid_8000000a";
 		return 0;
 	}
 
-	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+	if (!boot_cpu_has(X86_FEATURE_SVM)) {
 		if (msg)
 			*msg = "svm not available";
 		return 0;

From 6c7caebc26c5f0b618f0ef6b851e9f5f27c3812f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 13 Jun 2016 14:48:25 +0200
Subject: [PATCH 105/302] KVM: introduce kvm->created_vcpus

The race between creating the irqchip and the first VCPU is
currently fixed by checking for the presence of an irqchip before
updating kvm->online_vcpus, and undoing the whole VCPU creation
if someone created the irqchip in the meantime.

Instead, introduce a new field in struct kvm that will count VCPUs
under a mutex, without the atomic access and memory ordering that we
need elsewhere to protect the vcpus array.  This also plugs the race
and is more easily applicable in all similar circumstances.

Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  8 ++++++++
 virt/kvm/kvm_main.c      | 23 +++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c9c973a7dd9ae..63c6ab30bc8108 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -371,7 +371,15 @@ struct kvm {
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+	/*
+	 * created_vcpus is protected by kvm->lock, and is incremented
+	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
+	 * incremented after storing the kvm_vcpu pointer in vcpus,
+	 * and is accessed atomically.
+	 */
 	atomic_t online_vcpus;
+	int created_vcpus;
 	int last_boosted_vcpu;
 	struct list_head vm_list;
 	struct mutex lock;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 02e98f3131bda1..15b757ae64e14e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2346,9 +2346,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	if (id >= KVM_MAX_VCPU_ID)
 		return -EINVAL;
 
+	mutex_lock(&kvm->lock);
+	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+		mutex_unlock(&kvm->lock);
+		return -EINVAL;
+	}
+
+	kvm->created_vcpus++;
+	mutex_unlock(&kvm->lock);
+
 	vcpu = kvm_arch_vcpu_create(kvm, id);
-	if (IS_ERR(vcpu))
-		return PTR_ERR(vcpu);
+	if (IS_ERR(vcpu)) {
+		r = PTR_ERR(vcpu);
+		goto vcpu_decrement;
+	}
 
 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
@@ -2361,10 +2372,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		r = -EINVAL;
 		goto unlock_vcpu_destroy;
 	}
-	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
-		r = -EINVAL;
-		goto unlock_vcpu_destroy;
-	}
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
@@ -2397,6 +2404,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	mutex_unlock(&kvm->lock);
 vcpu_destroy:
 	kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+	mutex_lock(&kvm->lock);
+	kvm->created_vcpus--;
+	mutex_unlock(&kvm->lock);
 	return r;
 }
 

From 557abc40d121358883d2da8bc8bf976d6e8ec332 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 13 Jun 2016 14:50:04 +0200
Subject: [PATCH 106/302] KVM: remove kvm_vcpu_compatible

The new created_vcpus field makes it possible to avoid the race between
irqchip and VCPU creation in a much nicer way; just check under kvm->lock
whether a VCPU has already been created.

We can then remove KVM_APIC_ARCHITECTURE too, because at this point the
symbol is only governing the default definition of kvm_vcpu_compatible.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Kconfig     |  1 -
 arch/x86/kvm/x86.c       | 11 +++--------
 include/linux/kvm_host.h |  6 ------
 virt/kvm/Kconfig         |  3 ---
 virt/kvm/kvm_main.c      |  4 ----
 5 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 639a6e34500c10..ab8e32f7b9a868 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -32,7 +32,6 @@ config KVM
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
-	select KVM_APIC_ARCHITECTURE
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf227212aebb15..ab2f45a50bb5b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3774,7 +3774,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		r = -EEXIST;
 		if (irqchip_in_kernel(kvm))
 			goto split_irqchip_unlock;
-		if (atomic_read(&kvm->online_vcpus))
+		if (kvm->created_vcpus)
 			goto split_irqchip_unlock;
 		r = kvm_setup_empty_irq_routing(kvm);
 		if (r)
@@ -3839,7 +3839,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (kvm->arch.vpic)
 			goto create_irqchip_unlock;
 		r = -EINVAL;
-		if (atomic_read(&kvm->online_vcpus))
+		if (kvm->created_vcpus)
 			goto create_irqchip_unlock;
 		r = -ENOMEM;
 		vpic = kvm_create_pic(kvm);
@@ -3995,7 +3995,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_SET_BOOT_CPU_ID:
 		r = 0;
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus) != 0)
+		if (kvm->created_vcpus)
 			r = -EBUSY;
 		else
 			kvm->arch.bsp_vcpu_id = arg;
@@ -7639,11 +7639,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
-	return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
 struct static_key kvm_no_apic_vcpu __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 63c6ab30bc8108..0640ee92b97872 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1105,12 +1105,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-#else
-static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-#endif
-
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 {
 	/*
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index e5d6108f5e8596..b0cc1a34db27a4 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
 
-config KVM_APIC_ARCHITECTURE
-       bool
-
 config KVM_MMIO
        bool
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 15b757ae64e14e..ef54b4c3179262 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2368,10 +2368,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto vcpu_destroy;
 
 	mutex_lock(&kvm->lock);
-	if (!kvm_vcpu_compatible(vcpu)) {
-		r = -EINVAL;
-		goto unlock_vcpu_destroy;
-	}
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;

From a03825bbd0c39feeba605912cdbc28e79e4e01e1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 13 Jun 2016 14:50:04 +0200
Subject: [PATCH 107/302] KVM: s390: use kvm->created_vcpus

The new created_vcpus field avoids possible races between enabling
capabilities and creating VCPUs.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 49c60393a15ce1..0dcf9b8fc12c2e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -422,7 +422,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		break;
 	case KVM_CAP_S390_VECTOR_REGISTERS:
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus)) {
+		if (kvm->created_vcpus) {
 			r = -EBUSY;
 		} else if (MACHINE_HAS_VX) {
 			set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -437,7 +437,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 	case KVM_CAP_S390_RI:
 		r = -EINVAL;
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus)) {
+		if (kvm->created_vcpus) {
 			r = -EBUSY;
 		} else if (test_facility(64)) {
 			set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -492,7 +492,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		ret = -EBUSY;
 		VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus) == 0) {
+		if (!kvm->created_vcpus) {
 			kvm->arch.use_cmma = 1;
 			ret = 0;
 		}
@@ -536,7 +536,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 
 		ret = -EBUSY;
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus) == 0) {
+		if (!kvm->created_vcpus) {
 			/* gmap_alloc will round the limit up */
 			struct gmap *new = gmap_alloc(current->mm, new_limit);
 
@@ -713,7 +713,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 	int ret = 0;
 
 	mutex_lock(&kvm->lock);
-	if (atomic_read(&kvm->online_vcpus)) {
+	if (kvm->created_vcpus) {
 		ret = -EBUSY;
 		goto out;
 	}

From 53f9eedff713bab262b64682ad1abb1e8116d041 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:20:00 -0700
Subject: [PATCH 108/302] kvm: lapic: separate start_sw_tscdeadline from
 start_apic_timer

The function to start the tsc deadline timer virtualization will also
be used by the pre_block hook when we use the preemption timer; move it
into a separate function. No logic changes.

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bbb5b283ff63a9..f1cf8a5ede114c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1313,6 +1313,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 		__delay(tsc_deadline - guest_tsc);
 }
 
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+	u64 ns = 0;
+	ktime_t expire;
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	unsigned long flags;
+	ktime_t now;
+
+	if (unlikely(!tscdeadline || !this_tsc_khz))
+		return;
+
+	local_irq_save(flags);
+
+	now = apic->lapic_timer.timer.base->get_time();
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	if (likely(tscdeadline > guest_tsc)) {
+		ns = (tscdeadline - guest_tsc) * 1000000ULL;
+		do_div(ns, this_tsc_khz);
+		expire = ktime_add_ns(now, ns);
+		expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+		hrtimer_start(&apic->lapic_timer.timer,
+				expire, HRTIMER_MODE_ABS_PINNED);
+	} else
+		apic_timer_expired(apic);
+
+	local_irq_restore(flags);
+}
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1359,32 +1389,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		/* lapic timer in tsc deadline mode */
-		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
-		u64 ns = 0;
-		ktime_t expire;
-		struct kvm_vcpu *vcpu = apic->vcpu;
-		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
-		unsigned long flags;
-
-		if (unlikely(!tscdeadline || !this_tsc_khz))
-			return;
-
-		local_irq_save(flags);
-
-		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-		if (likely(tscdeadline > guest_tsc)) {
-			ns = (tscdeadline - guest_tsc) * 1000000ULL;
-			do_div(ns, this_tsc_khz);
-			expire = ktime_add_ns(now, ns);
-			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-			hrtimer_start(&apic->lapic_timer.timer,
-				      expire, HRTIMER_MODE_ABS_PINNED);
-		} else
-			apic_timer_expired(apic);
-
-		local_irq_restore(flags);
+		start_sw_tscdeadline(apic);
 	}
 }
 

From ce7a058a2117f0bca2f42f2870a97bfa9aa8e099 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:20:01 -0700
Subject: [PATCH 109/302] KVM: x86: support using the vmx preemption timer for
 tsc deadline timer

The VMX preemption timer can be used to virtualize the TSC deadline timer.
The VMX preemption timer is armed when the vCPU is running, and a VMExit
will happen if the virtual TSC deadline timer expires.

When the vCPU thread is blocked because of HLT, KVM will switch to use
an hrtimer, and then go back to the VMX preemption timer when the vCPU
thread is unblocked.

This solution avoids the OS's complex hrtimer system and the cost of
handling the host timer interrupt, replacing them with a little math
(for guest->host TSC and host TSC->preemption timer conversion)
and a cheaper VMexit.  This benefits latency for isolated pCPUs.

[A word about performance... Yunhong reported a 30% reduction in average
 latency from cyclictest.  I made a similar test with tscdeadline_latency
 from kvm-unit-tests, and measured

 - ~20 clock cycles loss (out of ~3200, so less than 1% but still
   statistically significant) in the worst case where the test halts
   just after programming the TSC deadline timer

 - ~800 clock cycles gain (25% reduction in latency) in the best case
   where the test busy waits.

 I removed the VMX bits from Yunhong's patch, to concentrate them in the
 next patch - Paolo]

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++
 arch/x86/kvm/lapic.c            | 73 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/lapic.h            |  5 +++
 arch/x86/kvm/trace.h            | 15 +++++++
 arch/x86/kvm/x86.c              |  5 +++
 5 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e0fbe7e70dc193..e055f3787dc9cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1005,6 +1005,9 @@ struct kvm_x86_ops {
 	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
 			      uint32_t guest_irq, bool set);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1cf8a5ede114c..fdc05ae08bac94 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1343,6 +1343,68 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	WARN_ON(swait_active(&vcpu->wq));
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+	apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+	if (apic_lvtt_tscdeadline(apic) &&
+	    !atomic_read(&apic->lapic_timer.pending)) {
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			hrtimer_cancel(&apic->lapic_timer.timer);
+
+			/* In case the sw timer triggered in the window */
+			if (atomic_read(&apic->lapic_timer.pending)) {
+				apic->lapic_timer.hv_timer_in_use = false;
+				kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+			}
+		}
+		trace_kvm_hv_timer_state(vcpu->vcpu_id,
+				apic->lapic_timer.hv_timer_in_use);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	/* Possibly the TSC deadline timer is not enabled yet */
+	if (!apic->lapic_timer.hv_timer_in_use)
+		return;
+
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1389,7 +1451,16 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		start_sw_tscdeadline(apic);
+		/* lapic timer in tsc deadline mode */
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (kvm_x86_ops->set_hv_timer &&
+		    !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+					apic->lapic_timer.hv_timer_in_use);
+		} else
+			start_sw_tscdeadline(apic);
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa98..336ba51bb16ecc 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -20,6 +20,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
+	bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -212,4 +213,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
 			const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5cb4..0a6cc6754ec5a8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
 		  __entry->vec)
 );
 
+TRACE_EVENT(kvm_hv_timer_state,
+		TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+		TP_ARGS(vcpu_id, hv_timer_in_use),
+		TP_STRUCT__entry(
+			__field(unsigned int, vcpu_id)
+			__field(unsigned int, hv_timer_in_use)
+			),
+		TP_fast_assign(
+			__entry->vcpu_id = vcpu_id;
+			__entry->hv_timer_in_use = hv_timer_in_use;
+			),
+		TP_printk("vcpu_id %x hv_timer %x\n",
+			__entry->vcpu_id,
+			__entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ab2f45a50bb5b8..1f4b2926a5a32d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2740,6 +2740,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
+
+		if (kvm_lapic_hv_timer_in_use(vcpu) &&
+				kvm_x86_ops->set_hv_timer(vcpu,
+					kvm_get_lapic_tscdeadline_msr(vcpu)))
+			kvm_lapic_switch_to_sw_timer(vcpu);
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);

From bc22512bb24c480fae8ae96b233378ef96007590 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Date: Mon, 13 Jun 2016 14:19:58 -0700
Subject: [PATCH 110/302] kvm: vmx: rename vmx_pre/post_block to
 pi_pre/post_block

Prepare to switch from the preemption timer to an hrtimer in
vmx_pre/post_block.  The current functions only handle posted
interrupts, so rename them accordingly.

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 57ec6a4b49581e..29e78fdd8ab885 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10706,7 +10706,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  *   this case, return 1, otherwise, return 0.
  *
  */
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
 	unsigned int dest;
@@ -10772,7 +10772,15 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+	if (pi_pre_block(vcpu))
+		return 1;
+
+	return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc old, new;
@@ -10813,6 +10821,11 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+	pi_post_block(vcpu);
+}
+
 /*
  * vmx_update_pi_irte - set IRTE for Posted-Interrupts
  *

From 64672c95ea4c2f7096e519e826076867e8ef0938 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang <yunhong.jiang@intel.com>
Date: Mon, 13 Jun 2016 14:19:59 -0700
Subject: [PATCH 111/302] kvm: vmx: hook preemption timer support

Hook the VMX preemption timer to the "hv timer" functionality added
by an earlier patch.  This includes: checking whether the feature is
supported and whether it is broken on the CPU, the hooks to set up and
clean up the VMX preemption timer, arming the timer on vmentry,
and handling the vmexit.

A module parameter states if the VMX preemption timer should be
utilized.

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
[Move hv_deadline_tsc to struct vcpu_vmx, use -1 as the "unset" value.
 Put all VMX bits here.  Enable it by default #yolo. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/vmx.c              | 180 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |   3 +-
 3 files changed, 183 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e055f3787dc9cf..360c5171ea1a89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1079,6 +1079,8 @@ extern u32  kvm_max_guest_tsc_khz;
 extern u8   kvm_tsc_scaling_ratio_frac_bits;
 /* maximum allowed value of TSC scaling ratio */
 extern u64  kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64  kvm_default_tsc_scaling_ratio;
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29e78fdd8ab885..e185649fb8b72c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
+/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON						\
@@ -597,6 +604,9 @@ struct vcpu_vmx {
 #define PML_ENTITY_NUM		512
 	struct page *pml_pg;
 
+	/* apic deadline value in host tsc */
+	u64 hv_deadline_tsc;
+
 	u64 current_tsc_ratio;
 
 	bool guest_pkru_valid;
@@ -1056,6 +1066,61 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86  - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+	u32 eax = cpuid_eax(0x00000001), i;
+
+	/* Clear the reserved bits */
+	eax &= ~(0x3U << 14 | 0xfU << 28);
+	for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++)
+		if (eax == vmx_preemption_cpu_tfms[i])
+			return true;
+
+	return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+	if (cpu_has_broken_vmx_preemption_timer())
+		return false;
+
+	return vmcs_config.pin_based_exec_ctrl &
+		PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
 static inline bool cpu_has_vmx_posted_intr(void)
 {
 	return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -3308,7 +3373,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		return -EIO;
 
 	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+		 PIN_BASED_VMX_PREEMPTION_TIMER;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
 				&_pin_based_exec_control) < 0)
 		return -EIO;
@@ -4781,6 +4847,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 
 	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
 		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+	/* Enable the preemption timer dynamically */
+	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 	return pin_based_exec_ctrl;
 }
 
@@ -4899,6 +4967,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	/* Control */
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	vmx->hv_deadline_tsc = -1;
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -6389,6 +6458,17 @@ static __init int hardware_setup(void)
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 	}
 
+	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+		u64 vmx_msr;
+
+		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+		cpu_preemption_timer_multi =
+			 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+	} else {
+		kvm_x86_ops->set_hv_timer = NULL;
+		kvm_x86_ops->cancel_hv_timer = NULL;
+	}
+
 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
 	return alloc_kvm_area();
@@ -7564,6 +7644,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	kvm_lapic_expired_hv_timer(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7615,6 +7701,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8623,6 +8710,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host);
 }
 
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 tscl;
+	u32 delta_tsc;
+
+	if (vmx->hv_deadline_tsc == -1)
+		return;
+
+	tscl = rdtsc();
+	if (vmx->hv_deadline_tsc > tscl)
+		/* sure to be 32 bit only because checked on set_hv_timer */
+		delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+			cpu_preemption_timer_multi);
+	else
+		delta_tsc = 0;
+
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8672,6 +8779,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
+	vmx_arm_hv_timer(vcpu);
+
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
@@ -10662,6 +10771,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 	return X86EMUL_CONTINUE;
 }
 
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+				  u64 divisor, u64 *result)
+{
+	u64 low = a << shift, high = a >> (64 - shift);
+
+	/* To avoid the overflow on divq */
+	if (high >= divisor)
+		return 1;
+
+	/* Low hold the result, high hold rem which is discarded */
+	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+	    "rm" (divisor), "0" (low), "1" (high));
+	*result = low;
+
+	return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 tscl = rdtsc(), delta_tsc;
+
+	delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl);
+
+	/* Convert to host delta tsc if tsc scaling is enabled */
+	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+			u64_shl_div_u64(delta_tsc,
+				kvm_tsc_scaling_ratio_frac_bits,
+				vcpu->arch.tsc_scaling_ratio,
+				&delta_tsc))
+		return -ERANGE;
+
+	/*
+	 * If the delta tsc can't fit in the 32 bit after the multi shift,
+	 * we can't use the preemption timer.
+	 * It's possible that it fits on later vmentries, but checking
+	 * on every vmentry is costly so we just use an hrtimer.
+	 */
+	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+		return -ERANGE;
+
+	vmx->hv_deadline_tsc = tscl + delta_tsc;
+	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+			PIN_BASED_VMX_PREEMPTION_TIMER);
+	return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	vmx->hv_deadline_tsc = -1;
+	vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+			PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
 	if (ple_gap)
@@ -10777,6 +10944,9 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 	if (pi_pre_block(vcpu))
 		return 1;
 
+	if (kvm_lapic_hv_timer_in_use(vcpu))
+		kvm_lapic_switch_to_sw_timer(vcpu);
+
 	return 0;
 }
 
@@ -10823,6 +10993,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
+	if (kvm_x86_ops->set_hv_timer)
+		kvm_lapic_switch_to_hv_timer(vcpu);
+
 	pi_post_block(vcpu);
 }
 
@@ -11038,6 +11211,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.pmu_ops = &intel_pmu_ops,
 
 	.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+	.set_hv_timer = vmx_set_hv_timer,
+	.cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f4b2926a5a32d..299219630c9470 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -114,7 +114,8 @@ u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;

From 708e75a3ee750dce1072134e630d66c4e6eaf63c Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 18 May 2016 21:01:20 +0200
Subject: [PATCH 112/302] KVM: PPC: Book3S PR: Fix illegal opcode emulation

If kvmppc_handle_exit_pr() calls kvmppc_emulate_instruction() to emulate
one instruction (in the BOOK3S_INTERRUPT_H_EMUL_ASSIST case), it calls
kvmppc_core_queue_program() afterwards if kvmppc_emulate_instruction()
returned EMULATE_FAIL, so the guest gets a program interrupt for the
illegal opcode.
However, the kvmppc_emulate_instruction() also tried to inject a
program exception for this already, so the program interrupt gets
injected twice and the return address in srr0 gets destroyed.
All other callers of kvmppc_emulate_instruction() are also injecting
a program interrupt, and since the callers have the right knowledge
about the srr1 flags that should be used, it is the function
kvmppc_emulate_instruction() that should _not_ inject program
interrupts, so remove the kvmppc_core_queue_program() here.

This fixes the issue discovered by Laurent Vivier with kvm-unit-tests
where the logs are filled with these messages when the test tries
to execute an illegal instruction:

     Couldn't emulate instruction 0x00000000 (op 0 xop 0)
     kvmppc_handle_exit_pr: emulation at 700 failed (00000000)

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Alexander Graf <agraf@suse.de>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/emulate.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 5cc2e7af3a7b96..b379146de55bf1 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			advance = 0;
 			printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
 			       "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
-			kvmppc_core_queue_program(vcpu, 0);
 		}
 	}
 

From b69890d18fa33a53cec6ae5c93555ee0c24fe0a9 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Thu, 19 May 2016 11:33:31 +0200
Subject: [PATCH 113/302] KVM: PPC: Book3S PR: Fix contents of SRR1 when
 injecting a program exception

vcpu->arch.shadow_srr1 only contains usable values for injecting
a program exception into the guest if we entered the function
kvmppc_handle_exit_pr() with exit_nr == BOOK3S_INTERRUPT_PROGRAM.
In other cases, the shadow_srr1 bits are zero. Since we want to
pass an illegal-instruction program check to the guest, set
"flags" to SRR1_PROGILL for these other cases.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_pr.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 8e4f64f0b7741d..a910fef86bbacb 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		int emul;
 
 program_interrupt:
-		flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+		/*
+		 * shadow_srr1 only contains valid flags if we came here via
+		 * a program exception. The other exceptions (emulation assist,
+		 * FP unavailable, etc.) do not provide flags in SRR1, so use
+		 * an illegal-instruction exception when injecting a program
+		 * interrupt into the guest.
+		 */
+		if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+			flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+		else
+			flags = SRR1_PROGILL;
 
 		emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
 		if (emul != EMULATE_DONE) {

From 6dd06d15a86e8fca21ed4fb568bed2b3da7a7907 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Sun, 15 May 2016 09:44:13 +0530
Subject: [PATCH 114/302] powerpc/powernv: Remove the usage of PACAR1 from opal
 wrappers

The OPAL_CALL wrapper code stores r1 (the stack pointer) into PACAR1 purely
for debugging purposes. The power7_wakeup* functions rely on the stack
pointer saved in PACAR1. If any opal call is made using the opal wrapper
(directly or indirectly) before we fall through to power7_wakeup*, it ends
up replacing r1 in PACAR1(r13), leading to a kernel panic. So far we have
not seen any issues because we have never made any opal calls using the OPAL
wrapper before power7_wakeup*. But the subsequent HMI patch needs to invoke
C calls during the cpu wakeup/idle path that indirectly make opal calls
using the opal wrapper. This patch facilitates the subsequent HMI patch by
removing the usage of PACAR1 from the opal call wrapper.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e45b88a5d7e0f7..df6ad949e35f6d 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1);						\
 	OPAL_BRANCH(opal_tracepoint_entry) \
 	mfcr	r12;			\
 	stw	r12,8(r1);		\
-	std	r1,PACAR1(r13);		\
 	li	r11,0;			\
 	mfmsr	r12;			\
 	ori	r11,r11,MSR_EE;		\
@@ -127,7 +126,6 @@ opal_tracepoint_entry:
 	mfcr	r12
 	std	r11,16(r1)
 	stw	r12,8(r1)
-	std	r1,PACAR1(r13)
 	li	r11,0
 	mfmsr	r12
 	ori	r11,r11,MSR_EE

From fd7bacbca47a86a6f418440d8a5d7b7edbb2f8f9 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Sun, 15 May 2016 09:44:26 +0530
Subject: [PATCH 115/302] KVM: PPC: Book3S HV: Fix TB corruption in guest exit
 path on HMI interrupt

When a guest is assigned to a core it converts the host Timebase (TB)
into guest TB by adding guest timebase offset before entering into
guest. During guest exit it restores the guest TB to host TB. This means
under certain conditions (Guest migration) host TB and guest TB can differ.

When we get an HMI for TB related issues the opal HMI handler would
try fixing errors and restore the correct host TB value. With no guest
running, we don't have any issues. But with guest running on the core
we run into TB corruption issues.

If we get an HMI while in the guest, the current HMI handler invokes opal
hmi handler before forcing guest to exit. The guest exit path subtracts
the guest TB offset from the current TB value which may have already
been restored with host value by opal hmi handler. This leads to incorrect
host and guest TB values.

With split-core, things become more complex. With split-core, TB also gets
split and each subcore gets its own TB register. When the hmi handler fixes
a TB error and restores the TB value, it affects all the TB values of
sibling subcores on the same core. On TB errors all the threads in the core
get an HMI. With existing code, the individual threads call the opal hmi
handler independently, which can easily throw TB out of sync if we have
guests running on subcores. Hence we need to co-ordinate among all the
threads before making the opal hmi handler call followed by TB resync.

This patch introduces a sibling subcore state structure (shared by all
threads in the core) in paca which holds information about whether sibling
subcores are in Guest mode or host mode. An array in_guest[] of size
MAX_SUBCORE_PER_CORE=4 is used to maintain the state of each subcore.
The subcore id is used as index into in_guest[] array. Only primary
thread entering/exiting the guest is responsible to set/unset its
designated array element.

On TB error, we get HMI interrupt on every thread on the core. Upon HMI,
this patch will now force guest to vacate the core/subcore. Primary
thread from each subcore will then turn off its respective bit
from the above bitmap during the guest exit path just after the
guest->host partition switch is complete.

All other threads that have just exited the guest OR were already in host
will wait until all other subcores clear their respective bit.
Once all the subcores turn off their respective bit, all threads will
make a call to the opal hmi handler.

The opal hmi handler does not necessarily resync the TB value for
every HMI interrupt. It does so only for HMIs caused by
TB errors. For the rest, it does not touch the TB value. Hence to make things
simpler, primary thread would call TB resync explicitly once for each
core immediately after opal hmi handler instead of subtracting guest
offset from TB. TB resync call will restore the TB with host value.
Thus we can be sure about the TB state.

One of the primary threads exiting the guest will take up the
responsibility of calling TB resync. It will use one of the top bits
(bit 63) from subcore state flags bitmap to make the decision. The first
primary thread (among the subcores) that is able to set the bit will
have to call the TB resync. All other threads will wait until the TB
resync is complete.  Once TB resync is complete all threads will then
proceed.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/hmi.h          |  45 ++++++
 arch/powerpc/include/asm/paca.h         |   6 +
 arch/powerpc/kernel/Makefile            |   2 +-
 arch/powerpc/kernel/exceptions-64s.S    |   4 +-
 arch/powerpc/kernel/hmi.c               |  56 ++++++++
 arch/powerpc/kernel/idle_power7.S       |   5 +-
 arch/powerpc/kernel/traps.c             |   5 +
 arch/powerpc/kvm/book3s_hv.c            |  37 +++++
 arch/powerpc/kvm/book3s_hv_ras.c        | 176 ++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  65 ++++++++-
 10 files changed, 396 insertions(+), 5 deletions(-)
 create mode 100644 arch/powerpc/include/asm/hmi.h
 create mode 100644 arch/powerpc/kernel/hmi.c

diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
new file mode 100644
index 00000000000000..88b4901ac4eef4
--- /dev/null
+++ b/arch/powerpc/include/asm/hmi.h
@@ -0,0 +1,45 @@
+/*
+ * Hypervisor Maintenance Interrupt header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_HMI_H__
+#define __ASM_PPC64_HMI_H__
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define	CORE_TB_RESYNC_REQ_BIT		63
+#define MAX_SUBCORE_PER_CORE		4
+
+/*
+ * sibling_subcore_state structure is used to co-ordinate all threads
+ * during HMI to avoid TB corruption. This structure is allocated once
+ * per each core and shared by all threads on that core.
+ */
+struct sibling_subcore_state {
+	unsigned long	flags;
+	u8		in_guest[MAX_SUBCORE_PER_CORE];
+};
+
+extern void wait_for_subcore_guest_exit(void);
+extern void wait_for_tb_resync(void);
+#else
+static inline void wait_for_subcore_guest_exit(void) { }
+static inline void wait_for_tb_resync(void) { }
+#endif
+#endif /* __ASM_PPC64_HMI_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 546540b9109595..4b17bd058e01f0 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -25,6 +25,7 @@
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #include <asm/kvm_book3s_asm.h>
 #endif
+#include <asm/hmi.h>
 
 register struct paca_struct *local_paca asm("r13");
 
@@ -181,6 +182,11 @@ struct paca_struct {
 	 */
 	u16 in_mce;
 	u8 hmi_event_available;		 /* HMI event is available */
+	/*
+	 * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+	 * more details
+	 */
+	struct sibling_subcore_state *sibling_subcore_state;
 #endif
 
 	/* Stuff for accurate time accounting */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2da380fcc34c69..6972a23433d349 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o hmi.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)		+= vdso64/
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4c9440629128cc..0eba47e074b94c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -680,6 +680,8 @@ _GLOBAL(__replay_interrupt)
 BEGIN_FTR_SECTION
 	cmpwi	r3,0xe80
 	beq	h_doorbell_common
+	cmpwi	r3,0xe60
+	beq	hmi_exception_common
 FTR_SECTION_ELSE
 	cmpwi	r3,0xa00
 	beq	doorbell_super_common
@@ -1172,7 +1174,7 @@ fwnmi_data_area:
 
 	.globl hmi_exception_early
 hmi_exception_early:
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
 	mr	r10,r1			/* Save r1			*/
 	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
new file mode 100644
index 00000000000000..e3f738eb1cacc7
--- /dev/null
+++ b/arch/powerpc/kernel/hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+	int i;
+
+	/*
+	 * NULL bitmap pointer indicates that KVM module hasn't
+	 * been loaded yet and hence no guests are running.
+	 * If no KVM is in use, no need to co-ordinate among threads
+	 * as all of them will always be in host and no one is going
+	 * to modify TB other than the opal hmi handler.
+	 * Hence, just return from here.
+	 */
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+		while (local_paca->sibling_subcore_state->in_guest[i])
+			cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		cpu_relax();
+}
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 470ceebd2d237c..bb5112908fb29a 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -270,8 +270,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
 	ld	r2,PACATOC(r13);					\
 	ld	r1,PACAR1(r13);						\
 	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
-	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
-	bl	opal_call_realmode;					\
+	li	r3,0;			/* NULL argument */		\
+	bl	hmi_exception_realmode;					\
+	nop;								\
 	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
 20:	nop;
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9229ba63c37086..9ec95daccad92a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -60,6 +60,7 @@
 #include <asm/switch_to.h>
 #include <asm/tm.h>
 #include <asm/debug.h>
+#include <asm/hmi.h>
 #include <sysdev/fsl_pci.h>
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@ -307,9 +308,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 {
 	__this_cpu_inc(irq_stat.hmi_exceptions);
 
+	wait_for_subcore_guest_exit();
+
 	if (ppc_md.hmi_exception_early)
 		ppc_md.hmi_exception_early(regs);
 
+	wait_for_tb_resync();
+
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e20beae5ca7a46..bc27c4d6383c71 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -52,6 +52,7 @@
 #include <asm/switch_to.h>
 #include <asm/smp.h>
 #include <asm/dbell.h>
+#include <asm/hmi.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.hcall_implemented = kvmppc_hcall_impl_hv,
 };
 
+static int kvm_init_subcore_bitmap(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	struct sibling_subcore_state *sibling_subcore_state;
+
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		/* Ignore if it is already allocated. */
+		if (paca[first_cpu].sibling_subcore_state)
+			continue;
+
+		sibling_subcore_state =
+			kmalloc_node(sizeof(struct sibling_subcore_state),
+							GFP_KERNEL, node);
+		if (!sibling_subcore_state)
+			return -ENOMEM;
+
+		memset(sibling_subcore_state, 0,
+				sizeof(struct sibling_subcore_state));
+
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca[cpu].sibling_subcore_state = sibling_subcore_state;
+		}
+	}
+	return 0;
+}
+
 static int kvmppc_book3s_init_hv(void)
 {
 	int r;
@@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
 	if (r < 0)
 		return -ENODEV;
 
+	r = kvm_init_subcore_bitmap();
+	if (r)
+		return r;
+
 	kvm_ops_hv.owner = THIS_MODULE;
 	kvmppc_hv_ops = &kvm_ops_hv;
 
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 93b5f5c9b4455e..0fa70a9618d7ad 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -13,6 +13,9 @@
 #include <linux/kernel.h>
 #include <asm/opal.h>
 #include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
 
 /* SRR1 bits for machine check on POWER7 */
 #define SRR1_MC_LDSTERR		(1ul << (63-42))
@@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
 {
 	return kvmppc_realmode_mc_power7(vcpu);
 }
+
+/* Check if dynamic split is in force and return subcore size accordingly. */
+static inline int kvmppc_cur_subcore_size(void)
+{
+	if (local_paca->kvm_hstate.kvm_split_mode)
+		return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
+
+	return threads_per_subcore;
+}
+
+void kvmppc_subcore_enter_guest(void)
+{
+	int thread_id, subcore_id;
+
+	thread_id = cpu_thread_in_core(local_paca->paca_index);
+	subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+	local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
+}
+
+void kvmppc_subcore_exit_guest(void)
+{
+	int thread_id, subcore_id;
+
+	thread_id = cpu_thread_in_core(local_paca->paca_index);
+	subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+	local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
+}
+
+static bool kvmppc_tb_resync_required(void)
+{
+	if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		return false;
+
+	return true;
+}
+
+static void kvmppc_tb_resync_done(void)
+{
+	clear_bit(CORE_TB_RESYNC_REQ_BIT,
+			&local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by primary thread during
+ * guest exit path.
+ *
+ * There are multiple reasons why HMI could occur, one of them is
+ * Timebase (TB) error. If this HMI is due to TB error, then TB would
+ * have been in stopped state. The opal hmi handler Will fix it and
+ * restore the TB value with host timebase value. For HMI caused due
+ * to non-TB errors, opal hmi handler will not touch/restore TB register
+ * and hence there won't be any change in TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of TB register whether it holds guest or host timebase
+ * value. Hence the idea is to resync the TB on every HMI, so that we
+ * know about the exact state of the TB value. Resync TB call will
+ * restore TB to host timebase.
+ *
+ * Things to consider:
+ * - On TB error, HMI interrupt is reported on all the threads of the core
+ *   that has encountered TB error irrespective of split-core mode.
+ * - The very first thread on the core that get chance to fix TB error
+ *   would resync the TB with local chipTOD value.
+ * - The resync TB is a core level action i.e. it will sync all the TBs
+ *   in that core independent of split-core mode. This means if we trigger
+ *   TB sync from a thread from one subcore, it would affect TB values of
+ *   sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making opal hmi handler.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only primary threads from each subcore is responsible
+ * to set/unset its designated array element while entering/exiting the
+ * guest.
+ *
+ * After invoking opal hmi handler call, one of the thread (of entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - Primary thread from each subcore tries to set resync required bit[63]
+ *   of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ *   responsibility of TB resync. (Let us call it as thread leader)
+ * - All other threads which are in host will call
+ *   wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ *   paca->sibling_subcore_state to get cleared.
+ * - All the primary thread will clear its subcore status from subcore
+ *   state in_guest[] array respectively.
+ * - Once all primary threads clear in_guest[0-3], all of them will invoke
+ *   opal hmi handler.
+ * - Now all threads will wait for TB resync to complete by invoking
+ *   wait_for_tb_resync() except the thread leader.
+ * - Thread leader will do a TB resync by invoking opal_resync_timebase()
+ *   call and then it will clear the resync required bit.
+ * - All other threads will now come out of resync wait loop and proceed
+ *   with individual execution.
+ * - On return of this function, primary thread will signal all
+ *   secondary threads to proceed.
+ * - All secondary threads will eventually call opal hmi handler on
+ *   their exit path.
+ */
+
+long kvmppc_realmode_hmi_handler(void)
+{
+	int ptid = local_paca->kvm_hstate.ptid;
+	bool resync_req;
+
+	/* This is only called on primary thread. */
+	BUG_ON(ptid != 0);
+	__this_cpu_inc(irq_stat.hmi_exceptions);
+
+	/*
+	 * By now primary thread has already completed guest->host
+	 * partition switch but haven't signaled secondaries yet.
+	 * All the secondary threads on this subcore is waiting
+	 * for primary thread to signal them to go ahead.
+	 *
+	 * For threads from subcore which isn't in guest, they all will
+	 * wait until all other subcores on this core exit the guest.
+	 *
+	 * Now set the resync required bit. If you are the first to
+	 * set this bit then kvmppc_tb_resync_required() function will
+	 * return true. For rest all other subcores
+	 * kvmppc_tb_resync_required() will return false.
+	 *
+	 * If resync_req == true, then this thread is responsible to
+	 * initiate TB resync after hmi handler has completed.
+	 * All other threads on this core will wait until this thread
+	 * clears the resync required bit flag.
+	 */
+	resync_req = kvmppc_tb_resync_required();
+
+	/* Reset the subcore status to indicate it has exited guest */
+	kvmppc_subcore_exit_guest();
+
+	/*
+	 * Wait for other subcores on this core to exit the guest.
+	 * All the primary threads and threads from subcore that are
+	 * not in guest will wait here until all subcores are out
+	 * of guest context.
+	 */
+	wait_for_subcore_guest_exit();
+
+	/*
+	 * At this point we are sure that primary threads from each
+	 * subcore on this core have completed guest->host partition
+	 * switch. Now it is safe to call HMI handler.
+	 */
+	if (ppc_md.hmi_exception_early)
+		ppc_md.hmi_exception_early(NULL);
+
+	/*
+	 * Check if this thread is responsible to resync TB.
+	 * All other threads will wait until this thread completes the
+	 * TB resync.
+	 */
+	if (resync_req) {
+		opal_resync_timebase();
+		/* Reset TB resync req bit */
+		kvmppc_tb_resync_done();
+	} else {
+		wait_for_tb_resync();
+	}
+	return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e571ad277398fd..0d246fca157a96 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -29,6 +29,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/tm.h>
+#include <asm/opal.h>
 
 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
@@ -373,6 +374,18 @@ kvm_secondary_got_guest:
 	lwsync
 	std	r0, HSTATE_KVM_VCORE(r13)
 
+	/*
+	 * All secondaries exiting guest will fall through this path.
+	 * Before proceeding, just check for HMI interrupt and
+	 * invoke opal hmi handler. By now we are sure that the
+	 * primary thread on this core/subcore has already made partition
+	 * switch/TB resync and we are good to call opal hmi handler.
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	bne	kvm_no_guest
+
+	li	r3,0			/* NULL argument */
+	bl	hmi_exception_realmode
 /*
  * At this point we have finished executing in the guest.
  * We need to wait for hwthread_req to become zero, since
@@ -427,6 +440,22 @@ kvm_no_guest:
  * whole-core mode, so we need to nap.
  */
 kvm_unsplit_nap:
+	/*
+	 * When secondaries are napping in kvm_unsplit_nap() with
+	 * hwthread_req = 1, HMI goes ignored even though subcores are
+	 * already exited the guest. Hence HMI keeps waking up secondaries
+	 * from nap in a loop and secondaries always go back to nap since
+	 * no vcore is assigned to them. This makes impossible for primary
+	 * thread to get hold of secondary threads resulting into a soft
+	 * lockup in KVM path.
+	 *
+	 * Let us check if HMI is pending and handle it before we go to nap.
+	 */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	bne	55f
+	li	r3, 0			/* NULL argument */
+	bl	hmi_exception_realmode
+55:
 	/*
 	 * Ensure that secondary doesn't nap when it has
 	 * its vcore pointer set.
@@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+	/* Mark the subcore state as inside guest */
+	bl	kvmppc_subcore_enter_guest
+	nop
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 
@@ -1683,6 +1717,23 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+	/* If HMI, call kvmppc_realmode_hmi_handler() */
+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
+	bne	27f
+	bl	kvmppc_realmode_hmi_handler
+	nop
+	li	r12, BOOK3S_INTERRUPT_HMI
+	/*
+	 * At this point kvmppc_realmode_hmi_handler would have resync-ed
+	 * the TB. Hence it is not required to subtract guest timebase
+	 * offset from timebase. So, skip it.
+	 *
+	 * Also, do not call kvmppc_subcore_exit_guest() because it has
+	 * been invoked as part of kvmppc_realmode_hmi_handler().
+	 */
+	b	30f
+
+27:
 	/* Subtract timebase offset from timebase */
 	ld	r8,VCORE_TB_OFFSET(r5)
 	cmpdi	r8,0
@@ -1698,8 +1749,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	addis	r8,r8,0x100		/* if so, increment upper 40 bits */
 	mtspr	SPRN_TBU40,r8
 
+17:	bl	kvmppc_subcore_exit_guest
+	nop
+30:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
+
 	/* Reset PCR */
-17:	ld	r0, VCORE_PCR(r5)
+	ld	r0, VCORE_PCR(r5)
 	cmpdi	r0, 0
 	beq	18f
 	li	r0, 0
@@ -2461,6 +2517,8 @@ BEGIN_FTR_SECTION
 	cmpwi	r6, 3			/* hypervisor doorbell? */
 	beq	3f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	cmpwi	r6, 0xa			/* Hypervisor maintenance ? */
+	beq	4f
 	li	r3, 1			/* anything else, return 1 */
 0:	blr
 
@@ -2482,6 +2540,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, -1
 	blr
 
+	/* Woken up due to Hypervisor maintenance interrupt */
+4:	li	r12, BOOK3S_INTERRUPT_HMI
+	li	r3, 1
+	blr
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:

From 414d3b07496604a4372466a6b474ca24291a143c Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:52:54 +0100
Subject: [PATCH 116/302] s390/kvm: page table invalidation notifier

Pass an address range to the page table invalidation notifier
for KVM. This allows notification of changes that affect a larger
virtual memory area, e.g. for 1MB pages.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  3 ++-
 arch/s390/kvm/kvm-s390.c     | 18 +++++++++++++-----
 arch/s390/mm/gmap.c          | 19 ++++++++++++++++---
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index d054c1b07a3c39..bc0eadf9ed8e85 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -39,7 +39,8 @@ struct gmap {
  */
 struct gmap_notifier {
 	struct list_head list;
-	void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+	void (*notifier_call)(struct gmap *gmap, unsigned long start,
+			      unsigned long end);
 };
 
 struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0dcf9b8fc12c2e..67f1b6b4c060a8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -150,7 +150,8 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+			      unsigned long end);
 
 /*
  * This callback is executed during stop_machine(). All CPUs are therefore
@@ -1976,16 +1977,23 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
 	kvm_s390_vcpu_request(vcpu);
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+			      unsigned long end)
 {
-	int i;
 	struct kvm *kvm = gmap->private;
 	struct kvm_vcpu *vcpu;
+	unsigned long prefix;
+	int i;
 
+	if (start >= 1UL << 31)
+		/* We are only interested in prefix pages */
+		return;
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		/* match against both prefix pages */
-		if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+		prefix = kvm_s390_get_prefix(vcpu);
+		if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+				   start, end);
 			kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
 		}
 	}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index cace818d86eb95..b5820bf47ec69a 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -572,6 +572,21 @@ void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
 }
 EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
 
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+			       unsigned long end)
+{
+	struct gmap_notifier *nb;
+
+	list_for_each_entry(nb, &gmap_notifier_list, list)
+		nb->notifier_call(gmap, start, end);
+}
+
 /**
  * gmap_ipte_notify - mark a range of ptes for invalidation notification
  * @gmap: pointer to guest mapping meta data structure
@@ -643,7 +658,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
 {
 	unsigned long offset, gaddr;
 	unsigned long *table;
-	struct gmap_notifier *nb;
 	struct gmap *gmap;
 
 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
@@ -655,8 +669,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
 		if (!table)
 			continue;
 		gaddr = __gmap_segment_gaddr(table) + offset;
-		list_for_each_entry(nb, &gmap_notifier_list, list)
-			nb->notifier_call(gmap, gaddr);
+		gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
 	}
 	spin_unlock(&gmap_notifier_lock);
 }

From 8ecb1a59d6c6674bc98e4eee0c2482490748e21a Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:54:14 +0100
Subject: [PATCH 117/302] s390/mm: use RCU for gmap notifier list and the
 per-mm gmap list

The gmap notifier list and the gmap list in the mm_struct change rarely.
Use RCU to optimize the readers of these lists.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h        |  1 +
 arch/s390/include/asm/mmu.h         | 11 +++++---
 arch/s390/include/asm/mmu_context.h |  3 ++-
 arch/s390/mm/gmap.c                 | 39 +++++++++++++++++------------
 arch/s390/mm/pgalloc.c              | 16 ++++++------
 5 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index bc0eadf9ed8e85..2cf49624af9944 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -39,6 +39,7 @@ struct gmap {
  */
 struct gmap_notifier {
 	struct list_head list;
+	struct rcu_head rcu;
 	void (*notifier_call)(struct gmap *gmap, unsigned long start,
 			      unsigned long end);
 };
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 081b2ad99d7377..b941528cc49e69 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -8,8 +8,9 @@ typedef struct {
 	cpumask_t cpu_attach_mask;
 	atomic_t attach_count;
 	unsigned int flush_mm;
-	spinlock_t list_lock;
+	spinlock_t pgtable_lock;
 	struct list_head pgtable_list;
+	spinlock_t gmap_lock;
 	struct list_head gmap_list;
 	unsigned long asce;
 	unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
 	unsigned int use_skey:1;
 } mm_context_t;
 
-#define INIT_MM_CONTEXT(name)						      \
-	.context.list_lock    = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
-	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list),    \
+#define INIT_MM_CONTEXT(name)						   \
+	.context.pgtable_lock =						   \
+			__SPIN_LOCK_UNLOCKED(name.context.pgtable_lock),   \
+	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+	.context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
 	.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
 
 static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c837b79b455dc8..3ce3854b7a41b8 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,8 +15,9 @@
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
-	spin_lock_init(&mm->context.list_lock);
+	spin_lock_init(&mm->context.pgtable_lock);
 	INIT_LIST_HEAD(&mm->context.pgtable_list);
+	spin_lock_init(&mm->context.gmap_lock);
 	INIT_LIST_HEAD(&mm->context.gmap_list);
 	cpumask_clear(&mm->context.cpu_attach_mask);
 	atomic_set(&mm->context.attach_count, 0);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index b5820bf47ec69a..8b56423a829795 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -70,9 +70,9 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
 	gmap->asce = atype | _ASCE_TABLE_LENGTH |
 		_ASCE_USER_BITS | __pa(table);
 	gmap->asce_end = limit;
-	down_write(&mm->mmap_sem);
-	list_add(&gmap->list, &mm->context.gmap_list);
-	up_write(&mm->mmap_sem);
+	spin_lock(&mm->context.gmap_lock);
+	list_add_rcu(&gmap->list, &mm->context.gmap_list);
+	spin_unlock(&mm->context.gmap_lock);
 	return gmap;
 
 out_free:
@@ -128,14 +128,16 @@ void gmap_free(struct gmap *gmap)
 	else
 		__tlb_flush_global();
 
+	spin_lock(&gmap->mm->context.gmap_lock);
+	list_del_rcu(&gmap->list);
+	spin_unlock(&gmap->mm->context.gmap_lock);
+	synchronize_rcu();
+
 	/* Free all segment & region tables. */
 	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 		__free_pages(page, 2);
 	gmap_radix_tree_free(&gmap->guest_to_host);
 	gmap_radix_tree_free(&gmap->host_to_guest);
-	down_write(&gmap->mm->mmap_sem);
-	list_del(&gmap->list);
-	up_write(&gmap->mm->mmap_sem);
 	kfree(gmap);
 }
 EXPORT_SYMBOL_GPL(gmap_free);
@@ -369,11 +371,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
 	struct gmap *gmap;
 	int flush;
 
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
 		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
 		if (flush)
 			gmap_flush_tlb(gmap);
 	}
+	rcu_read_unlock();
 }
 
 /**
@@ -555,7 +559,7 @@ static DEFINE_SPINLOCK(gmap_notifier_lock);
 void gmap_register_ipte_notifier(struct gmap_notifier *nb)
 {
 	spin_lock(&gmap_notifier_lock);
-	list_add(&nb->list, &gmap_notifier_list);
+	list_add_rcu(&nb->list, &gmap_notifier_list);
 	spin_unlock(&gmap_notifier_lock);
 }
 EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
@@ -567,8 +571,9 @@ EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
 void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
 {
 	spin_lock(&gmap_notifier_lock);
-	list_del_init(&nb->list);
+	list_del_rcu(&nb->list);
 	spin_unlock(&gmap_notifier_lock);
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
 
@@ -662,16 +667,18 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
 
 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
 	offset = offset * (4096 / sizeof(pte_t));
-	spin_lock(&gmap_notifier_lock);
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+		spin_lock(&gmap->guest_table_lock);
 		table = radix_tree_lookup(&gmap->host_to_guest,
 					  vmaddr >> PMD_SHIFT);
-		if (!table)
-			continue;
-		gaddr = __gmap_segment_gaddr(table) + offset;
-		gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+		if (table)
+			gaddr = __gmap_segment_gaddr(table) + offset;
+		spin_unlock(&gmap->guest_table_lock);
+		if (table)
+			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
 	}
-	spin_unlock(&gmap_notifier_lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ptep_notify);
 
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e8b5962ac12ab8..7be1f94f70a8ce 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -149,7 +149,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	/* Try to get a fragment of a 4K page as a 2K page table */
 	if (!mm_alloc_pgste(mm)) {
 		table = NULL;
-		spin_lock_bh(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.pgtable_lock);
 		if (!list_empty(&mm->context.pgtable_list)) {
 			page = list_first_entry(&mm->context.pgtable_list,
 						struct page, lru);
@@ -164,7 +164,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 				list_del(&page->lru);
 			}
 		}
-		spin_unlock_bh(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.pgtable_lock);
 		if (table)
 			return table;
 	}
@@ -187,9 +187,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 		/* Return the first 2K fragment of the page */
 		atomic_set(&page->_mapcount, 1);
 		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-		spin_lock_bh(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.pgtable_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.pgtable_lock);
 	}
 	return table;
 }
@@ -203,13 +203,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	if (!mm_alloc_pgste(mm)) {
 		/* Free 2K page table fragment of a 4K page */
 		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-		spin_lock_bh(&mm->context.list_lock);
+		spin_lock_bh(&mm->context.pgtable_lock);
 		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
 		if (mask & 3)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
 			list_del(&page->lru);
-		spin_unlock_bh(&mm->context.list_lock);
+		spin_unlock_bh(&mm->context.pgtable_lock);
 		if (mask != 0)
 			return;
 	}
@@ -235,13 +235,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 		return;
 	}
 	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-	spin_lock_bh(&mm->context.list_lock);
+	spin_lock_bh(&mm->context.pgtable_lock);
 	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
 	if (mask & 3)
 		list_add_tail(&page->lru, &mm->context.pgtable_list);
 	else
 		list_del(&page->lru);
-	spin_unlock_bh(&mm->context.list_lock);
+	spin_unlock_bh(&mm->context.pgtable_lock);
 	table = (unsigned long *) (__pa(table) | (1U << bit));
 	tlb_remove_table(tlb, table);
 }

From b2d73b2a0ad1c758cb0c1acb01a911744b845942 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:54:42 +0100
Subject: [PATCH 118/302] s390/mm: extended gmap pte notifier

The current gmap pte notifier forces a pte into a read-write state.
If the pte is invalidated, the gmap notifier is called to inform KVM
that the mapping will go away.

Extend this approach to allow read-write, read-only and no-access
as possible target states and call the pte notifier for any change
to the pte.

This mechanism is used to temporarily set specific access rights for
a pte without doing the heavy work of a true mprotect call.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h    |   9 +-
 arch/s390/include/asm/pgtable.h |   2 +
 arch/s390/kvm/kvm-s390.c        |  13 +--
 arch/s390/mm/gmap.c             | 170 ++++++++++++++++++++++++--------
 arch/s390/mm/pgtable.c          |  54 +++++++++-
 5 files changed, 193 insertions(+), 55 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 2cf49624af9944..6897a0919446e3 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -59,8 +59,11 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+			 unsigned long len, int prot);
 
 #endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9951e7e5975632..35dde6afffcf95 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -886,6 +886,8 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry);
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+		    pte_t *ptep, int prot);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 67f1b6b4c060a8..b6e7f66f0f0168 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/mman.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
@@ -185,7 +186,7 @@ static struct notifier_block kvm_clock_notifier = {
 int kvm_arch_hardware_setup(void)
 {
 	gmap_notifier.notifier_call = kvm_gmap_notifier;
-	gmap_register_ipte_notifier(&gmap_notifier);
+	gmap_register_pte_notifier(&gmap_notifier);
 	atomic_notifier_chain_register(&s390_epoch_delta_notifier,
 				       &kvm_clock_notifier);
 	return 0;
@@ -193,7 +194,7 @@ int kvm_arch_hardware_setup(void)
 
 void kvm_arch_hardware_unsetup(void)
 {
-	gmap_unregister_ipte_notifier(&gmap_notifier);
+	gmap_unregister_pte_notifier(&gmap_notifier);
 	atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
 					 &kvm_clock_notifier);
 }
@@ -2272,16 +2273,16 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 		return 0;
 	/*
 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
-	 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+	 * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
 	 * This ensures that the ipte instruction for this request has
 	 * already finished. We might race against a second unmapper that
 	 * wants to set the blocking bit. Lets just retry the request loop.
 	 */
 	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
 		int rc;
-		rc = gmap_ipte_notify(vcpu->arch.gmap,
-				      kvm_s390_get_prefix(vcpu),
-				      PAGE_SIZE * 2);
+		rc = gmap_mprotect_notify(vcpu->arch.gmap,
+					  kvm_s390_get_prefix(vcpu),
+					  PAGE_SIZE * 2, PROT_WRITE);
 		if (rc)
 			return rc;
 		goto retry;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 8b56423a829795..480c076afceb27 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -553,29 +553,29 @@ static LIST_HEAD(gmap_notifier_list);
 static DEFINE_SPINLOCK(gmap_notifier_lock);
 
 /**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
 {
 	spin_lock(&gmap_notifier_lock);
 	list_add_rcu(&nb->list, &gmap_notifier_list);
 	spin_unlock(&gmap_notifier_lock);
 }
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 
 /**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
  * @nb: pointer to the gmap notifier block
  */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 {
 	spin_lock(&gmap_notifier_lock);
 	list_del_rcu(&nb->list);
 	spin_unlock(&gmap_notifier_lock);
 	synchronize_rcu();
 }
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
 
 /**
  * gmap_call_notifier - call all registered invalidation callbacks
@@ -593,62 +593,150 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
 }
 
 /**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ *
+ * Returns a table pointer for the given guest address.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+					     unsigned long gaddr)
+{
+	unsigned long *table;
+
+	table = gmap->table;
+	switch (gmap->asce & _ASCE_TYPE_MASK) {
+	case _ASCE_TYPE_REGION1:
+		table += (gaddr >> 53) & 0x7ff;
+		if (*table & _REGION_ENTRY_INVALID)
+			return NULL;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+		/* Fallthrough */
+	case _ASCE_TYPE_REGION2:
+		table += (gaddr >> 42) & 0x7ff;
+		if (*table & _REGION_ENTRY_INVALID)
+			return NULL;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+		/* Fallthrough */
+	case _ASCE_TYPE_REGION3:
+		table += (gaddr >> 31) & 0x7ff;
+		if (*table & _REGION_ENTRY_INVALID)
+			return NULL;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+		/* Fallthrough */
+	case _ASCE_TYPE_SEGMENT:
+		table += (gaddr >> 20) & 0x7ff;
+	}
+	return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ *		      and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+			       spinlock_t **ptl)
+{
+	unsigned long *table;
+
+	/* Walk the gmap page table, lock and get pte pointer */
+	table = gmap_table_walk(gmap, gaddr);
+	if (!table || *table & _SEGMENT_ENTRY_INVALID)
+		return NULL;
+	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+			     unsigned long vmaddr)
+{
+	struct mm_struct *mm = gmap->mm;
+	bool unlocked = false;
+
+	if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked))
+		return -EFAULT;
+	if (unlocked)
+		/* lost mmap_sem, caller has to retry __gmap_translate */
+		return 0;
+	/* Connect the page tables */
+	return __gmap_link(gmap, gaddr, vmaddr);
+}
+
+/**
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+	spin_unlock(ptl);
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ *                        call the notifier if any pte changes again
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
  *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
  */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+			 unsigned long len, int prot)
 {
-	unsigned long addr;
+	unsigned long vmaddr;
 	spinlock_t *ptl;
 	pte_t *ptep;
-	bool unlocked;
 	int rc = 0;
 
 	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
 		return -EINVAL;
+	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
+		return -EINVAL;
 	down_read(&gmap->mm->mmap_sem);
 	while (len) {
-		unlocked = false;
-		/* Convert gmap address and connect the page tables */
-		addr = __gmap_translate(gmap, gaddr);
-		if (IS_ERR_VALUE(addr)) {
-			rc = addr;
-			break;
-		}
-		/* Get the page mapped */
-		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-				     &unlocked)) {
-			rc = -EFAULT;
-			break;
+		rc = -EAGAIN;
+		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+		if (ptep) {
+			rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot);
+			gmap_pte_op_end(ptl);
 		}
-		/* While trying to map mmap_sem got unlocked. Let us retry */
-		if (unlocked)
+		if (rc) {
+			vmaddr = __gmap_translate(gmap, gaddr);
+			if (IS_ERR_VALUE(vmaddr)) {
+				rc = vmaddr;
+				break;
+			}
+			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+			if (rc)
+				break;
 			continue;
-		rc = __gmap_link(gmap, gaddr, addr);
-		if (rc)
-			break;
-		/* Walk the process page table, lock and get pte pointer */
-		ptep = get_locked_pte(gmap->mm, addr, &ptl);
-		VM_BUG_ON(!ptep);
-		/* Set notification bit in the pgste of the pte */
-		if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-			ptep_set_notify(gmap->mm, addr, ptep);
-			gaddr += PAGE_SIZE;
-			len -= PAGE_SIZE;
 		}
-		pte_unmap_unlock(ptep, ptl);
+		gaddr += PAGE_SIZE;
+		len -= PAGE_SIZE;
 	}
 	up_read(&gmap->mm->mmap_sem);
 	return rc;
 }
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
 
 /**
  * ptep_notify - call all invalidation callbacks for a specific pte.
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index fa286d0c0f2da3..ab65fb11e05863 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -179,9 +179,9 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
 	return pgste;
 }
 
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-					unsigned long addr,
-					pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+				       unsigned long addr,
+				       pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
 	if (pgste_val(pgste) & PGSTE_IN_BIT) {
@@ -199,7 +199,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
 
 	if (mm_has_pgste(mm)) {
 		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
 	}
 	return pgste;
 }
@@ -414,6 +414,50 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 	pgste_set_unlock(ptep, pgste);
 }
 
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, int prot)
+{
+	pte_t entry;
+	pgste_t pgste;
+	int pte_i, pte_p;
+
+	pgste = pgste_get_lock(ptep);
+	entry = *ptep;
+	/* Check pte entry after all locks have been acquired */
+	pte_i = pte_val(entry) & _PAGE_INVALID;
+	pte_p = pte_val(entry) & _PAGE_PROTECT;
+	if ((pte_i && (prot != PROT_NONE)) ||
+	    (pte_p && (prot & PROT_WRITE))) {
+		pgste_set_unlock(ptep, pgste);
+		return -EAGAIN;
+	}
+	/* Change access rights and set the pgste notification bit */
+	if (prot == PROT_NONE && !pte_i) {
+		ptep_flush_direct(mm, addr, ptep);
+		pgste = pgste_update_all(entry, pgste, mm);
+		pte_val(entry) |= _PAGE_INVALID;
+	}
+	if (prot == PROT_READ && !pte_p) {
+		ptep_flush_direct(mm, addr, ptep);
+		pte_val(entry) &= ~_PAGE_INVALID;
+		pte_val(entry) |= _PAGE_PROTECT;
+	}
+	pgste_val(pgste) |= PGSTE_IN_BIT;
+	pgste = pgste_set_pte(ptep, pgste, entry);
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
 	if (!non_swap_entry(entry))
@@ -483,7 +527,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
 	pgste_val(pgste) &= ~PGSTE_UC_BIT;
 	pte = *ptep;
 	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
 		__ptep_ipte(addr, ptep);
 		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
 			pte_val(pte) |= _PAGE_PROTECT;

From 6ea427bbbd4078297bb1dbd6c5cb83f3f48aac46 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:55:04 +0100
Subject: [PATCH 119/302] s390/mm: add reference counter to gmap structure

Let's use a reference counter mechanism to control the lifetime of
gmap structures. This will be needed for further changes related to
gmap shadows.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  9 +++-
 arch/s390/kvm/kvm-s390.c     | 16 +++----
 arch/s390/mm/gmap.c          | 90 ++++++++++++++++++++++++++++--------
 3 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 6897a0919446e3..e69853ce55dae3 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -15,6 +15,7 @@
  * @guest_to_host: radix tree with guest to host address translation
  * @host_to_guest: radix tree with pointer to segment table entries
  * @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
@@ -26,6 +27,7 @@ struct gmap {
 	struct radix_tree_root guest_to_host;
 	struct radix_tree_root host_to_guest;
 	spinlock_t guest_table_lock;
+	atomic_t ref_count;
 	unsigned long *table;
 	unsigned long asce;
 	unsigned long asce_end;
@@ -44,8 +46,11 @@ struct gmap_notifier {
 			      unsigned long end);
 };
 
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
 void gmap_enable(struct gmap *gmap);
 void gmap_disable(struct gmap *gmap);
 int gmap_map_segment(struct gmap *gmap, unsigned long from,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b6e7f66f0f0168..9dd52980605cb0 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -532,20 +532,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		if (!new_limit)
 			return -EINVAL;
 
-		/* gmap_alloc takes last usable address */
+		/* gmap_create takes last usable address */
 		if (new_limit != KVM_S390_NO_MEM_LIMIT)
 			new_limit -= 1;
 
 		ret = -EBUSY;
 		mutex_lock(&kvm->lock);
 		if (!kvm->created_vcpus) {
-			/* gmap_alloc will round the limit up */
-			struct gmap *new = gmap_alloc(current->mm, new_limit);
+			/* gmap_create will round the limit up */
+			struct gmap *new = gmap_create(current->mm, new_limit);
 
 			if (!new) {
 				ret = -ENOMEM;
 			} else {
-				gmap_free(kvm->arch.gmap);
+				gmap_remove(kvm->arch.gmap);
 				new->private = kvm;
 				kvm->arch.gmap = new;
 				ret = 0;
@@ -1394,7 +1394,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		else
 			kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
 						    sclp.hamax + 1);
-		kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+		kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
 		if (!kvm->arch.gmap)
 			goto out_err;
 		kvm->arch.gmap->private = kvm;
@@ -1427,7 +1427,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 		sca_del_vcpu(vcpu);
 
 	if (kvm_is_ucontrol(vcpu->kvm))
-		gmap_free(vcpu->arch.gmap);
+		gmap_remove(vcpu->arch.gmap);
 
 	if (vcpu->kvm->arch.use_cmma)
 		kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1460,7 +1460,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	debug_unregister(kvm->arch.dbf);
 	free_page((unsigned long)kvm->arch.sie_page2);
 	if (!kvm_is_ucontrol(kvm))
-		gmap_free(kvm->arch.gmap);
+		gmap_remove(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
@@ -1469,7 +1469,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 /* Section: vcpu related */
 static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+	vcpu->arch.gmap = gmap_create(current->mm, -1UL);
 	if (!vcpu->arch.gmap)
 		return -ENOMEM;
 	vcpu->arch.gmap->private = vcpu->kvm;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 480c076afceb27..fe25f1915800a8 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -21,13 +21,13 @@
 #include <asm/tlb.h>
 
 /**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
  * @mm: pointer to the parent mm_struct
  * @limit: maximum address of the gmap address space
  *
  * Returns a guest address space structure.
  */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
 {
 	struct gmap *gmap;
 	struct page *page;
@@ -58,7 +58,7 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
 	spin_lock_init(&gmap->guest_table_lock);
-	gmap->mm = mm;
+	atomic_set(&gmap->ref_count, 1);
 	page = alloc_pages(GFP_KERNEL, 2);
 	if (!page)
 		goto out_free;
@@ -70,9 +70,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
 	gmap->asce = atype | _ASCE_TABLE_LENGTH |
 		_ASCE_USER_BITS | __pa(table);
 	gmap->asce_end = limit;
-	spin_lock(&mm->context.gmap_lock);
-	list_add_rcu(&gmap->list, &mm->context.gmap_list);
-	spin_unlock(&mm->context.gmap_lock);
 	return gmap;
 
 out_free:
@@ -80,7 +77,28 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
 out:
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+	struct gmap *gmap;
+
+	gmap = gmap_alloc(limit);
+	if (!gmap)
+		return NULL;
+	gmap->mm = mm;
+	spin_lock(&mm->context.gmap_lock);
+	list_add_rcu(&gmap->list, &mm->context.gmap_list);
+	spin_unlock(&mm->context.gmap_lock);
+	return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
 
 static void gmap_flush_tlb(struct gmap *gmap)
 {
@@ -118,21 +136,10 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
  * gmap_free - free a guest address space
  * @gmap: pointer to the guest address space structure
  */
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
 {
 	struct page *page, *next;
 
-	/* Flush tlb. */
-	if (MACHINE_HAS_IDTE)
-		__tlb_flush_asce(gmap->mm, gmap->asce);
-	else
-		__tlb_flush_global();
-
-	spin_lock(&gmap->mm->context.gmap_lock);
-	list_del_rcu(&gmap->list);
-	spin_unlock(&gmap->mm->context.gmap_lock);
-	synchronize_rcu();
-
 	/* Free all segment & region tables. */
 	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 		__free_pages(page, 2);
@@ -140,7 +147,50 @@ void gmap_free(struct gmap *gmap)
 	gmap_radix_tree_free(&gmap->host_to_guest);
 	kfree(gmap);
 }
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+	atomic_inc(&gmap->ref_count);
+	return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+	if (atomic_dec_return(&gmap->ref_count) == 0)
+		gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+	/* Flush tlb. */
+	gmap_flush_tlb(gmap);
+	/* Remove gmap from the pre-mm list */
+	spin_lock(&gmap->mm->context.gmap_lock);
+	list_del_rcu(&gmap->list);
+	spin_unlock(&gmap->mm->context.gmap_lock);
+	synchronize_rcu();
+	/* Put reference */
+	gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
 
 /**
  * gmap_enable - switch primary space to the guest address space

From 4be130a08420d6918d80c1067f8078f425eb98df Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 12:12:18 +0100
Subject: [PATCH 120/302] s390/mm: add shadow gmap support

For a nested KVM guest the outer KVM host needs to create shadow
page tables for the nested guest. This patch adds the basic support
to the guest address space (gmap) code.

For each guest address space the inner KVM host creates, the first
outer KVM host needs to create shadow page tables. The address space
is identified by the ASCE loaded into the control register 1 at the
time the inner SIE instruction for the second nested KVM guest is
executed. The outer KVM host creates the shadow tables starting with
the table identified by the ASCE on an on-demand basis. The outer KVM
host will get repeated faults for all the shadow tables needed to
run the second KVM guest.

While a shadow page table for the second KVM guest is active the access
to the origin region, segment and page tables needs to be restricted
for the first KVM guest. For region, segment and page tables the first
KVM guest may read the memory, but a write attempt has to lead to an
unshadow.  This is done using the page invalid and read-only bits in the
page table of the first KVM guest. If the first guest re-accesses one of
the origin pages of a shadow, it gets a fault and the affected parts of
the shadow page table hierarchy need to be removed again.

PGSTE tables don't have to be shadowed, as all interpretation assist can't
deal with the invalid bits in the shadow pte being set differently than
the original ones provided by the first KVM guest.

Many bug fixes and improvements by David Hildenbrand.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h      |   52 +-
 arch/s390/include/asm/pgalloc.h   |    2 +
 arch/s390/include/asm/pgtable.h   |   10 +-
 arch/s390/include/asm/processor.h |    1 +
 arch/s390/mm/fault.c              |    1 +
 arch/s390/mm/gmap.c               | 1150 ++++++++++++++++++++++++++++-
 arch/s390/mm/pgalloc.c            |   23 +
 arch/s390/mm/pgtable.c            |   57 +-
 8 files changed, 1262 insertions(+), 34 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index e69853ce55dae3..58e65ee5b2d25e 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,6 +10,7 @@
 
 /**
  * struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
  * @crst_list: list of all crst tables used in the guest address space
  * @mm: pointer to the parent mm_struct
  * @guest_to_host: radix tree with guest to host address translation
@@ -19,6 +20,13 @@
  * @table: pointer to the page directory
  * @asce: address space control element for gmap page table
  * @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @removed: flag to indicate if a shadow guest address space has been removed
  */
 struct gmap {
 	struct list_head list;
@@ -33,8 +41,32 @@ struct gmap {
 	unsigned long asce_end;
 	void *private;
 	bool pfault_enabled;
+	/* Additional data for shadow guest address spaces */
+	struct radix_tree_root host_to_rmap;
+	struct list_head children;
+	struct list_head pt_list;
+	spinlock_t shadow_lock;
+	struct gmap *parent;
+	unsigned long orig_asce;
+	bool removed;
 };
 
+/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+	struct gmap_rmap *next;
+	unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+	for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+	for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
 /**
  * struct gmap_notifier - notify function block for page invalidation
  * @notifier_call: address of callback function
@@ -46,6 +78,11 @@ struct gmap_notifier {
 			      unsigned long end);
 };
 
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+	return !!gmap->parent;
+}
+
 struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
 void gmap_remove(struct gmap *gmap);
 struct gmap *gmap_get(struct gmap *gmap);
@@ -64,9 +101,22 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
 
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+			   unsigned long *pgt, int *dat_protection);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
+		     unsigned long paddr, int write);
+
 void gmap_register_pte_notifier(struct gmap_notifier *);
 void gmap_unregister_pte_notifier(struct gmap_notifier *);
-void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+		     unsigned long bits);
 
 int gmap_mprotect_notify(struct gmap *, unsigned long start,
 			 unsigned long len, int prot);
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index da34cb6b1f3b03..f4eb9843eed4a6 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
 void crst_table_free(struct mm_struct *, unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
 extern int page_table_allocate_pgste;
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 35dde6afffcf95..a6e7fc8f5b495c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr)
 /* Bits in the region table entry */
 #define _REGION_ENTRY_ORIGIN	~0xfffUL/* region/segment table origin	    */
 #define _REGION_ENTRY_PROTECT	0x200	/* region protection bit	    */
+#define _REGION_ENTRY_OFFSET	0xc0	/* region table offset		    */
 #define _REGION_ENTRY_INVALID	0x20	/* invalid region table entry	    */
 #define _REGION_ENTRY_TYPE_MASK	0x0c	/* region/segment table type mask   */
 #define _REGION_ENTRY_TYPE_R1	0x0c	/* region first table type	    */
@@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr)
 #define PGSTE_GC_BIT	0x0002000000000000UL
 #define PGSTE_UC_BIT	0x0000800000000000UL	/* user dirty (migration) */
 #define PGSTE_IN_BIT	0x0000400000000000UL	/* IPTE notify bit */
+#define PGSTE_VSIE_BIT	0x0000200000000000UL	/* ref'd in a shadow table */
 
 /* Guest Page State used for virtualization */
 #define _PGSTE_GPS_ZERO		0x0000000080000000UL
@@ -885,12 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry);
 void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+		 pte_t *ptep, unsigned long bits);
 int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
-		    pte_t *ptep, int prot);
+		    pte_t *ptep, int prot, unsigned long bit);
 void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+		    pte_t *sptep, pte_t *tptep, int write);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
 
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9d4d311d7e522d..94c80b6d031d13 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -109,6 +109,7 @@ struct thread_struct {
         unsigned long ksp;              /* kernel stack pointer             */
 	mm_segment_t mm_segment;
 	unsigned long gmap_addr;	/* address of last gmap fault. */
+	unsigned int gmap_write_flag;	/* gmap fault write indication */
 	unsigned int gmap_pfault;	/* signal of a pending guest pfault */
 	struct per_regs per_user;	/* User specified PER registers */
 	struct per_event per_event;	/* Cause of the last PER trap */
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 19288c1b36d32b..b84416c11c434d 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 		(struct gmap *) S390_lowcore.gmap : NULL;
 	if (gmap) {
 		current->thread.gmap_addr = address;
+		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
 		address = __gmap_translate(gmap, address);
 		if (address == -EFAULT) {
 			fault = VM_FAULT_BADMAP;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index fe25f1915800a8..6695a09a3885c5 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -55,9 +55,13 @@ static struct gmap *gmap_alloc(unsigned long limit)
 	if (!gmap)
 		goto out;
 	INIT_LIST_HEAD(&gmap->crst_list);
+	INIT_LIST_HEAD(&gmap->children);
+	INIT_LIST_HEAD(&gmap->pt_list);
 	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
 	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
 	spin_lock_init(&gmap->guest_table_lock);
+	spin_lock_init(&gmap->shadow_lock);
 	atomic_set(&gmap->ref_count, 1);
 	page = alloc_pages(GFP_KERNEL, 2);
 	if (!page)
@@ -132,9 +136,38 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
 	} while (nr > 0);
 }
 
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+	struct gmap_rmap *rmap, *rnext, *head;
+	struct radix_tree_iter iter;
+	unsigned long indices[16];
+	unsigned long index;
+	void **slot;
+	int i, nr;
+
+	/* A radix tree is freed by deleting all of its entries */
+	index = 0;
+	do {
+		nr = 0;
+		radix_tree_for_each_slot(slot, root, &iter, index) {
+			indices[nr] = iter.index;
+			if (++nr == 16)
+				break;
+		}
+		for (i = 0; i < nr; i++) {
+			index = indices[i];
+			head = radix_tree_delete(root, index);
+			gmap_for_each_rmap_safe(rmap, rnext, head)
+				kfree(rmap);
+		}
+	} while (nr > 0);
+}
+
 /**
  * gmap_free - free a guest address space
  * @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
  */
 static void gmap_free(struct gmap *gmap)
 {
@@ -145,6 +178,17 @@ static void gmap_free(struct gmap *gmap)
 		__free_pages(page, 2);
 	gmap_radix_tree_free(&gmap->guest_to_host);
 	gmap_radix_tree_free(&gmap->host_to_guest);
+
+	/* Free additional data for a shadow gmap */
+	if (gmap_is_shadow(gmap)) {
+		/* Free all page tables. */
+		list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+			page_table_free_pgste(page);
+		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+		/* Release reference to the parent */
+		gmap_put(gmap->parent);
+	}
+
 	kfree(gmap);
 }
 
@@ -180,8 +224,20 @@ EXPORT_SYMBOL_GPL(gmap_put);
  */
 void gmap_remove(struct gmap *gmap)
 {
+	struct gmap *sg, *next;
+
 	/* Flush tlb. */
 	gmap_flush_tlb(gmap);
+	/* Remove all shadow gmaps linked to this gmap */
+	if (!list_empty(&gmap->children)) {
+		spin_lock(&gmap->shadow_lock);
+		list_for_each_entry_safe(sg, next, &gmap->children, list) {
+			gmap_flush_tlb(sg);
+			list_del(&sg->list);
+			gmap_put(sg);
+		}
+		spin_unlock(&gmap->shadow_lock);
+	}
 	/* Remove gmap from the pre-mm list */
 	spin_lock(&gmap->mm->context.gmap_lock);
 	list_del_rcu(&gmap->list);
@@ -227,7 +283,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 		return -ENOMEM;
 	new = (unsigned long *) page_to_phys(page);
 	crst_table_init(new, init);
-	spin_lock(&gmap->mm->page_table_lock);
+	spin_lock(&gmap->guest_table_lock);
 	if (*table & _REGION_ENTRY_INVALID) {
 		list_add(&page->lru, &gmap->crst_list);
 		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -235,7 +291,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 		page->index = gaddr;
 		page = NULL;
 	}
-	spin_unlock(&gmap->mm->page_table_lock);
+	spin_unlock(&gmap->guest_table_lock);
 	if (page)
 		__free_pages(page, 2);
 	return 0;
@@ -271,6 +327,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
 	unsigned long *entry;
 	int flush = 0;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	spin_lock(&gmap->guest_table_lock);
 	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
 	if (entry) {
@@ -310,6 +367,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
 	unsigned long off;
 	int flush;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	if ((to | len) & (PMD_SIZE - 1))
 		return -EINVAL;
 	if (len == 0 || to + len < to)
@@ -341,6 +399,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
 	unsigned long off;
 	int flush;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	if ((from | to | len) & (PMD_SIZE - 1))
 		return -EINVAL;
 	if (len == 0 || from + len < from || to + len < to ||
@@ -378,6 +437,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
  * This function does not establish potentially missing page table entries.
  * The mmap_sem of the mm that belongs to the address space must be held
  * when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
  */
 unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 {
@@ -385,6 +446,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 
 	vmaddr = (unsigned long)
 		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+	/* Note: guest_to_host is empty for a shadow gmap */
 	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 }
 EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -451,6 +513,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 	pmd_t *pmd;
 	int rc;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	/* Create higher level tables in the gmap page table */
 	table = gmap->table;
 	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -646,36 +709,65 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
  * gmap_table_walk - walk the gmap page tables
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
  *
- * Returns a table pointer for the given guest address.
+ * Note: Can also be called for shadow gmaps.
  */
 static inline unsigned long *gmap_table_walk(struct gmap *gmap,
-					     unsigned long gaddr)
+					     unsigned long gaddr, int level)
 {
 	unsigned long *table;
 
+	if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+		return NULL;
+	if (gmap_is_shadow(gmap) && gmap->removed)
+		return NULL;
+	if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+		return NULL;
 	table = gmap->table;
 	switch (gmap->asce & _ASCE_TYPE_MASK) {
 	case _ASCE_TYPE_REGION1:
 		table += (gaddr >> 53) & 0x7ff;
+		if (level == 4)
+			break;
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 		/* Fallthrough */
 	case _ASCE_TYPE_REGION2:
 		table += (gaddr >> 42) & 0x7ff;
+		if (level == 3)
+			break;
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 		/* Fallthrough */
 	case _ASCE_TYPE_REGION3:
 		table += (gaddr >> 31) & 0x7ff;
+		if (level == 2)
+			break;
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 		/* Fallthrough */
 	case _ASCE_TYPE_SEGMENT:
 		table += (gaddr >> 20) & 0x7ff;
+		if (level == 1)
+			break;
+		if (*table & _REGION_ENTRY_INVALID)
+			return NULL;
+		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+		table += (gaddr >> 12) & 0xff;
 	}
 	return table;
 }
@@ -688,16 +780,27 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
  * @ptl: pointer to the spinlock pointer
  *
  * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
  */
 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 			       spinlock_t **ptl)
 {
 	unsigned long *table;
 
+	if (gmap_is_shadow(gmap))
+		spin_lock(&gmap->guest_table_lock);
 	/* Walk the gmap page table, lock and get pte pointer */
-	table = gmap_table_walk(gmap, gaddr);
-	if (!table || *table & _SEGMENT_ENTRY_INVALID)
+	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+	if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+		if (gmap_is_shadow(gmap))
+			spin_unlock(&gmap->guest_table_lock);
 		return NULL;
+	}
+	if (gmap_is_shadow(gmap)) {
+		*ptl = &gmap->guest_table_lock;
+		return pte_offset_map((pmd_t *) table, gaddr);
+	}
 	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 }
 
@@ -717,6 +820,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
 	struct mm_struct *mm = gmap->mm;
 	bool unlocked = false;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked))
 		return -EFAULT;
 	if (unlocked)
@@ -735,6 +839,51 @@ static void gmap_pte_op_end(spinlock_t *ptl)
 	spin_unlock(ptl);
 }
 
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+			      unsigned long len, int prot, unsigned long bits)
+{
+	unsigned long vmaddr;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	int rc;
+
+	while (len) {
+		rc = -EAGAIN;
+		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+		if (ptep) {
+			rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+			gmap_pte_op_end(ptl);
+		}
+		if (rc) {
+			vmaddr = __gmap_translate(gmap, gaddr);
+			if (IS_ERR_VALUE(vmaddr))
+				return vmaddr;
+			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+			if (rc)
+				return rc;
+			continue;
+		}
+		gaddr += PAGE_SIZE;
+		len -= PAGE_SIZE;
+	}
+	return 0;
+}
+
 /**
  * gmap_mprotect_notify - change access rights for a range of ptes and
  *                        call the notifier if any pte changes again
@@ -752,61 +901,1012 @@ static void gmap_pte_op_end(spinlock_t *ptl)
 int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
 			 unsigned long len, int prot)
 {
-	unsigned long vmaddr;
-	spinlock_t *ptl;
-	pte_t *ptep;
-	int rc = 0;
+	int rc;
 
-	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
 		return -EINVAL;
 	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
 		return -EINVAL;
 	down_read(&gmap->mm->mmap_sem);
-	while (len) {
+	rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+	up_read(&gmap->mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ *                   absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+	unsigned long address, vmaddr;
+	spinlock_t *ptl;
+	pte_t *ptep, pte;
+	int rc;
+
+	while (1) {
 		rc = -EAGAIN;
 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
 		if (ptep) {
-			rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot);
+			pte = *ptep;
+			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+				address = pte_val(pte) & PAGE_MASK;
+				address += gaddr & ~PAGE_MASK;
+				*val = *(unsigned long *) address;
+				pte_val(*ptep) |= _PAGE_YOUNG;
+				/* Do *NOT* clear the _PAGE_INVALID bit! */
+				rc = 0;
+			}
+			gmap_pte_op_end(ptl);
+		}
+		if (!rc)
+			break;
+		vmaddr = __gmap_translate(gmap, gaddr);
+		if (IS_ERR_VALUE(vmaddr)) {
+			rc = vmaddr;
+			break;
+		}
+		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+				    struct gmap_rmap *rmap)
+{
+	void **slot;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+	if (slot) {
+		rmap->next = radix_tree_deref_slot_protected(slot,
+							&sg->guest_table_lock);
+		radix_tree_replace_slot(slot, rmap);
+	} else {
+		rmap->next = NULL;
+		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+				  rmap);
+	}
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+			     unsigned long paddr, unsigned long len, int prot)
+{
+	struct gmap *parent;
+	struct gmap_rmap *rmap;
+	unsigned long vmaddr;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	parent = sg->parent;
+	while (len) {
+		vmaddr = __gmap_translate(parent, paddr);
+		if (IS_ERR_VALUE(vmaddr))
+			return vmaddr;
+		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+		if (!rmap)
+			return -ENOMEM;
+		rmap->raddr = raddr;
+		rc = radix_tree_preload(GFP_KERNEL);
+		if (rc) {
+			kfree(rmap);
+			return rc;
+		}
+		rc = -EAGAIN;
+		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+		if (ptep) {
+			spin_lock(&sg->guest_table_lock);
+			rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+					     PGSTE_VSIE_BIT);
+			if (!rc)
+				gmap_insert_rmap(sg, vmaddr, rmap);
+			spin_unlock(&sg->guest_table_lock);
 			gmap_pte_op_end(ptl);
 		}
+		radix_tree_preload_end();
 		if (rc) {
-			vmaddr = __gmap_translate(gmap, gaddr);
-			if (IS_ERR_VALUE(vmaddr)) {
-				rc = vmaddr;
-				break;
-			}
-			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+			kfree(rmap);
+			rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
 			if (rc)
-				break;
+				return rc;
 			continue;
 		}
-		gaddr += PAGE_SIZE;
+		paddr += PAGE_SIZE;
 		len -= PAGE_SIZE;
 	}
-	up_read(&gmap->mm->mmap_sem);
+	return 0;
+}
+
+#define _SHADOW_RMAP_MASK	0x7
+#define _SHADOW_RMAP_REGION1	0x5
+#define _SHADOW_RMAP_REGION2	0x4
+#define _SHADOW_RMAP_REGION3	0x3
+#define _SHADOW_RMAP_SEGMENT	0x2
+#define _SHADOW_RMAP_PGTABLE	0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+	asm volatile(
+		"	.insn	rrf,0xb98e0000,%0,%1,0,0"
+		: : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+	unsigned long *table;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+	if (!table || *table & _PAGE_INVALID)
+		return;
+	gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+				unsigned long *pgt)
+{
+	int i;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	for (i = 0; i < 256; i++, raddr += 1UL << 12)
+		pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+	unsigned long sto, *ste, *pgt;
+	struct page *page;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+	if (!ste || *ste & _SEGMENT_ENTRY_INVALID)
+		return;
+	gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+	sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+	pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+	*ste = _SEGMENT_ENTRY_EMPTY;
+	__gmap_unshadow_pgt(sg, raddr, pgt);
+	/* Free page table */
+	page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+	list_del(&page->lru);
+	page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+				unsigned long *sgt)
+{
+	unsigned long asce, *pgt;
+	struct page *page;
+	int i;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+	for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+		if (sgt[i] & _SEGMENT_ENTRY_INVALID)
+			continue;
+		pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+		sgt[i] = _SEGMENT_ENTRY_EMPTY;
+		__gmap_unshadow_pgt(sg, raddr, pgt);
+		/* Free page table */
+		page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+		list_del(&page->lru);
+		page_table_free_pgste(page);
+	}
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+	unsigned long r3o, *r3e, *sgt;
+	struct page *page;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+	if (!r3e || *r3e & _REGION_ENTRY_INVALID)
+		return;
+	gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+	r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+	gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+	sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+	*r3e = _REGION3_ENTRY_EMPTY;
+	__gmap_unshadow_sgt(sg, raddr, sgt);
+	/* Free segment table */
+	page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+	list_del(&page->lru);
+	__free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+				unsigned long *r3t)
+{
+	unsigned long asce, *sgt;
+	struct page *page;
+	int i;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+	for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+		if (r3t[i] & _REGION_ENTRY_INVALID)
+			continue;
+		sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+		r3t[i] = _REGION3_ENTRY_EMPTY;
+		__gmap_unshadow_sgt(sg, raddr, sgt);
+		/* Free segment table */
+		page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+		list_del(&page->lru);
+		__free_pages(page, 2);
+	}
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+	unsigned long r2o, *r2e, *r3t;
+	struct page *page;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+	if (!r2e || *r2e & _REGION_ENTRY_INVALID)
+		return;
+	gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+	r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+	gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+	r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+	*r2e = _REGION2_ENTRY_EMPTY;
+	__gmap_unshadow_r3t(sg, raddr, r3t);
+	/* Free region 3 table */
+	page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+	list_del(&page->lru);
+	__free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+				unsigned long *r2t)
+{
+	unsigned long asce, *r3t;
+	struct page *page;
+	int i;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+	for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+		if (r2t[i] & _REGION_ENTRY_INVALID)
+			continue;
+		r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+		r2t[i] = _REGION2_ENTRY_EMPTY;
+		__gmap_unshadow_r3t(sg, raddr, r3t);
+		/* Free region 3 table */
+		page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+		list_del(&page->lru);
+		__free_pages(page, 2);
+	}
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+	unsigned long r1o, *r1e, *r2t;
+	struct page *page;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+	if (!r1e || *r1e & _REGION_ENTRY_INVALID)
+		return;
+	gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+	r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+	gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+	r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+	*r1e = _REGION1_ENTRY_EMPTY;
+	__gmap_unshadow_r2t(sg, raddr, r2t);
+	/* Free region 2 table */
+	page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+	list_del(&page->lru);
+	__free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+				unsigned long *r1t)
+{
+	unsigned long asce, *r2t;
+	struct page *page;
+	int i;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+	for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+		if (r1t[i] & _REGION_ENTRY_INVALID)
+			continue;
+		r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+		__gmap_unshadow_r2t(sg, raddr, r2t);
+		/* Clear entry and flush translation r1t -> r2t */
+		gmap_idte_one(asce, raddr);
+		r1t[i] = _REGION1_ENTRY_EMPTY;
+		/* Free region 2 table */
+		page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+		list_del(&page->lru);
+		__free_pages(page, 2);
+	}
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+	unsigned long *table;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	if (sg->removed)
+		return;
+	sg->removed = 1;
+	gmap_call_notifier(sg, 0, -1UL);
+	table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+	switch (sg->asce & _ASCE_TYPE_MASK) {
+	case _ASCE_TYPE_REGION1:
+		__gmap_unshadow_r1t(sg, 0, table);
+		break;
+	case _ASCE_TYPE_REGION2:
+		__gmap_unshadow_r2t(sg, 0, table);
+		break;
+	case _ASCE_TYPE_REGION3:
+		__gmap_unshadow_r3t(sg, 0, table);
+		break;
+	case _ASCE_TYPE_SEGMENT:
+		__gmap_unshadow_sgt(sg, 0, table);
+		break;
+	}
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
+{
+	struct gmap *sg;
+
+	list_for_each_entry(sg, &parent->children, list) {
+		if (sg->orig_asce != asce || sg->removed)
+			continue;
+		atomic_inc(&sg->ref_count);
+		return sg;
+	}
+	return NULL;
+}
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, NULL if out of memory or if
+ * anything goes wrong while protecting the top level pages.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
+{
+	struct gmap *sg, *new;
+	unsigned long limit;
+	int rc;
+
+	BUG_ON(gmap_is_shadow(parent));
+	spin_lock(&parent->shadow_lock);
+	sg = gmap_find_shadow(parent, asce);
+	spin_unlock(&parent->shadow_lock);
+	if (sg)
+		return sg;
+	/* No usable shadow for this asce yet, create a new shadow gmap */
+	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+	new = gmap_alloc(limit);
+	if (!new)
+		return NULL;
+	new->mm = parent->mm;
+	new->parent = gmap_get(parent);
+	new->orig_asce = asce;
+	down_read(&parent->mm->mmap_sem);
+	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+				((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+				PROT_READ, PGSTE_VSIE_BIT);
+	up_read(&parent->mm->mmap_sem);
+	if (!rc) {	/* publish only if the top level table got protected */
+		atomic_set(&new->ref_count, 2);
+		spin_lock(&parent->shadow_lock);
+		/* Recheck if another CPU created the same shadow */
+		sg = gmap_find_shadow(parent, asce);
+		if (!sg) {
+			list_add(&new->list, &parent->children);
+			sg = new;
+			new = NULL;
+		}
+		spin_unlock(&parent->shadow_lock);
+	}
+	if (new)	/* protect failed or lost the race: drop our copy */
+		gmap_free(new);
+	return sg;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its descendants.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
+{
+	unsigned long raddr, origin, offset, len;
+	unsigned long *s_r2t, *table;
+	struct page *page;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	/* Allocate a shadow region second table */
+	page = alloc_pages(GFP_KERNEL, 2);
+	if (!page)
+		return -ENOMEM;
+	page->index = r2t & _REGION_ENTRY_ORIGIN;
+	s_r2t = (unsigned long *) page_to_phys(page);
+	/* Install shadow region second table */
+	spin_lock(&sg->guest_table_lock);
+	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+	if (!table) {
+		rc = -EAGAIN;		/* Race with unshadow */
+		goto out_free;
+	}
+	if (!(*table & _REGION_ENTRY_INVALID)) {
+		rc = 0;			/* Already established */
+		goto out_free;
+	}
+	crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+	*table = (unsigned long) s_r2t |
+		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1;
+	list_add(&page->lru, &sg->crst_list);
+	spin_unlock(&sg->guest_table_lock);
+	/* Make r2t read-only in parent gmap page table */
+	raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+	origin = r2t & _REGION_ENTRY_ORIGIN;
+	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	if (rc) {
+		spin_lock(&sg->guest_table_lock);
+		gmap_unshadow_r2t(sg, raddr);
+		spin_unlock(&sg->guest_table_lock);
+	}
+	return rc;
+out_free:
+	spin_unlock(&sg->guest_table_lock);
+	__free_pages(page, 2);
 	return rc;
 }
-EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
+{
+	unsigned long raddr, origin, offset, len;
+	unsigned long *s_r3t, *table;
+	struct page *page;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	/* Allocate a shadow region third table */
+	page = alloc_pages(GFP_KERNEL, 2);
+	if (!page)
+		return -ENOMEM;
+	page->index = r3t & _REGION_ENTRY_ORIGIN;
+	s_r3t = (unsigned long *) page_to_phys(page);
+	/* Install shadow region third table */
+	spin_lock(&sg->guest_table_lock);
+	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+	if (!table) {
+		rc = -EAGAIN;		/* Race with unshadow */
+		goto out_free;
+	}
+	if (!(*table & _REGION_ENTRY_INVALID)) {
+		rc = 0;			/* Already established */
+		goto out_free;
+	}
+	crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+	*table = (unsigned long) s_r3t |
+		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2;
+	list_add(&page->lru, &sg->crst_list);
+	spin_unlock(&sg->guest_table_lock);
+	/* Make r3t read-only in parent gmap page table */
+	raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+	origin = r3t & _REGION_ENTRY_ORIGIN;
+	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	if (rc) {
+		spin_lock(&sg->guest_table_lock);
+		gmap_unshadow_r3t(sg, raddr);
+		spin_unlock(&sg->guest_table_lock);
+	}
+	return rc;
+out_free:
+	spin_unlock(&sg->guest_table_lock);
+	__free_pages(page, 2);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
+{
+	unsigned long raddr, origin, offset, len;
+	unsigned long *s_sgt, *table;
+	struct page *page;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	/* Allocate a shadow segment table */
+	page = alloc_pages(GFP_KERNEL, 2);
+	if (!page)
+		return -ENOMEM;
+	page->index = sgt & _REGION_ENTRY_ORIGIN;
+	s_sgt = (unsigned long *) page_to_phys(page);
+	/* Install shadow segment table */
+	spin_lock(&sg->guest_table_lock);
+	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+	if (!table) {
+		rc = -EAGAIN;		/* Race with unshadow */
+		goto out_free;
+	}
+	if (!(*table & _REGION_ENTRY_INVALID)) {
+		rc = 0;			/* Already established */
+		goto out_free;
+	}
+	crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+	*table = (unsigned long) s_sgt |
+		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3;
+	list_add(&page->lru, &sg->crst_list);
+	spin_unlock(&sg->guest_table_lock);
+	/* Make sgt read-only in parent gmap page table */
+	raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+	origin = sgt & _REGION_ENTRY_ORIGIN;
+	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	if (rc) {
+		spin_lock(&sg->guest_table_lock);
+		gmap_unshadow_sgt(sg, raddr);
+		spin_unlock(&sg->guest_table_lock);
+	}
+	return rc;
+out_free:
+	spin_unlock(&sg->guest_table_lock);
+	__free_pages(page, 2);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_pgt_lookup - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: pointer to return the parent gmap address of the shadowed page table
+ * @dat_protection: pointer to return if the pgtable is protected by dat
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+			   unsigned long *pgt, int *dat_protection)
+{
+	unsigned long *table;
+	struct page *page;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	spin_lock(&sg->guest_table_lock);
+	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+		/* Shadow page tables are full pages (pte+pgste) */
+		page = pfn_to_page(*table >> PAGE_SHIFT);
+		*pgt = page->index;
+		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+		rc = 0;
+	} else  {
+		rc = -EAGAIN;
+	}
+	spin_unlock(&sg->guest_table_lock);
+	return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt)
+{
+	unsigned long raddr, origin;
+	unsigned long *s_pgt, *table;
+	struct page *page;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	/* Allocate a shadow page table */
+	page = page_table_alloc_pgste(sg->mm);
+	if (!page)
+		return -ENOMEM;
+	page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+	s_pgt = (unsigned long *) page_to_phys(page);
+	/* Install shadow page table */
+	spin_lock(&sg->guest_table_lock);
+	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+	if (!table) {
+		rc = -EAGAIN;		/* Race with unshadow */
+		goto out_free;
+	}
+	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+		rc = 0;			/* Already established */
+		goto out_free;
+	}
+	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+		 (pgt & _SEGMENT_ENTRY_PROTECT);
+	list_add(&page->lru, &sg->pt_list);
+	spin_unlock(&sg->guest_table_lock);
+	/* Make pgt read-only in parent gmap page table (not the pgste) */
+	raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+	if (rc) {
+		spin_lock(&sg->guest_table_lock);
+		gmap_unshadow_pgt(sg, raddr);
+		spin_unlock(&sg->guest_table_lock);
+	}
+	return rc;
+out_free:
+	spin_unlock(&sg->guest_table_lock);
+	page_table_free_pgste(page);
+	return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @paddr: parent gmap address to get mapped at @saddr
+ * @write: =1 map r/w, =0 map r/o
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
+		     unsigned long paddr, int write)
+{
+	struct gmap *parent;
+	struct gmap_rmap *rmap;
+	unsigned long vmaddr;
+	spinlock_t *ptl;
+	pte_t *sptep, *tptep;
+	int rc;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	parent = sg->parent;
+
+	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+	if (!rmap)
+		return -ENOMEM;
+	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+	while (1) {
+		vmaddr = __gmap_translate(parent, paddr);
+		if (IS_ERR_VALUE(vmaddr)) {
+			rc = vmaddr;
+			break;
+		}
+		rc = radix_tree_preload(GFP_KERNEL);
+		if (rc)
+			break;
+		rc = -EAGAIN;
+		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+		if (sptep) {
+			spin_lock(&sg->guest_table_lock);
+			/* Get page table pointer */
+			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+			if (!tptep) {
+				spin_unlock(&sg->guest_table_lock);
+				gmap_pte_op_end(ptl);
+				radix_tree_preload_end();
+				break;
+			}
+			rc = ptep_shadow_pte(sg->mm, saddr,
+					     sptep, tptep, write);
+			if (rc > 0) {
+				/* Success and a new mapping */
+				gmap_insert_rmap(sg, vmaddr, rmap);
+				rmap = NULL;
+				rc = 0;
+			}
+			gmap_pte_op_end(ptl);
+			spin_unlock(&sg->guest_table_lock);
+		}
+		radix_tree_preload_end();
+		if (!rc)
+			break;
+		rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
+		if (rc)
+			break;
+	}
+	kfree(rmap);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+			       unsigned long offset, pte_t *pte)
+{
+	struct gmap_rmap *rmap, *rnext, *head;
+	unsigned long gaddr, start, end, bits, raddr;
+	unsigned long *table;
+
+	BUG_ON(!gmap_is_shadow(sg));
+	spin_lock(&sg->parent->guest_table_lock);
+	table = radix_tree_lookup(&sg->parent->host_to_guest,
+				  vmaddr >> PMD_SHIFT);
+	gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+	spin_unlock(&sg->parent->guest_table_lock);
+	if (!table)
+		return;
+
+	spin_lock(&sg->guest_table_lock);
+	if (sg->removed) {
+		spin_unlock(&sg->guest_table_lock);
+		return;
+	}
+	/* Check if gaddr lies within the top level table of orig_asce */
+	start = sg->orig_asce & _ASCE_ORIGIN;
+	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+	if (gaddr >= start && gaddr < end) {
+		/* The complete shadow table has to go */
+		gmap_unshadow(sg);
+		spin_unlock(&sg->guest_table_lock);
+		list_del(&sg->list);
+		gmap_put(sg);
+		return;
+	}
+	/* Remove the page table tree from one specific entry */
+	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+	gmap_for_each_rmap_safe(rmap, rnext, head) {
+		bits = rmap->raddr & _SHADOW_RMAP_MASK;
+		raddr = rmap->raddr ^ bits;
+		switch (bits) {
+		case _SHADOW_RMAP_REGION1:
+			gmap_unshadow_r2t(sg, raddr);
+			break;
+		case _SHADOW_RMAP_REGION2:
+			gmap_unshadow_r3t(sg, raddr);
+			break;
+		case _SHADOW_RMAP_REGION3:
+			gmap_unshadow_sgt(sg, raddr);
+			break;
+		case _SHADOW_RMAP_SEGMENT:
+			gmap_unshadow_pgt(sg, raddr);
+			break;
+		case _SHADOW_RMAP_PGTABLE:
+			gmap_unshadow_page(sg, raddr);
+			break;
+		}
+		kfree(rmap);
+	}
+	spin_unlock(&sg->guest_table_lock);
+}
 
 /**
  * ptep_notify - call all invalidation callbacks for a specific pte.
  * @mm: pointer to the process mm_struct
  * @addr: virtual address in the process address space
  * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
  *
  * This function is assumed to be called with the page table lock held
  * for the pte to notify.
  */
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+		 pte_t *pte, unsigned long bits)
 {
 	unsigned long offset, gaddr;
 	unsigned long *table;
-	struct gmap *gmap;
+	struct gmap *gmap, *sg, *next;
 
 	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
 	offset = offset * (4096 / sizeof(pte_t));
 	rcu_read_lock();
 	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+			spin_lock(&gmap->shadow_lock);
+			list_for_each_entry_safe(sg, next,
+						 &gmap->children, list)
+				gmap_shadow_notify(sg, vmaddr, offset, pte);
+			spin_unlock(&gmap->shadow_lock);
+		}
+		if (!(bits & PGSTE_IN_BIT))
+			continue;
 		spin_lock(&gmap->guest_table_lock);
 		table = radix_tree_lookup(&gmap->host_to_guest,
 					  vmaddr >> PMD_SHIFT);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 7be1f94f70a8ce..9c57a295a045b9 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
 	return new;
 }
 
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+	struct page *page;
+	unsigned long *table;
+
+	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+	if (page) {
+		table = (unsigned long *) page_to_phys(page);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+	}
+	return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+	__free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
 /*
  * page table entry allocation/free routines.
  */
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab65fb11e05863..5b02583fbf4cbc 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -184,9 +184,12 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
 				       pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-	if (pgste_val(pgste) & PGSTE_IN_BIT) {
-		pgste_val(pgste) &= ~PGSTE_IN_BIT;
-		ptep_notify(mm, addr, ptep);
+	unsigned long bits;
+
+	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+	if (bits) {
+		pgste_val(pgste) ^= bits;
+		ptep_notify(mm, addr, ptep, bits);
 	}
 #endif
 	return pgste;
@@ -420,12 +423,13 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
  * @addr: virtual address in the guest address space
  * @ptep: pointer to the page table entry
  * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
  *
  * Returns 0 if the access rights were changed and -EAGAIN if the current
  * and requested access rights are incompatible.
  */
 int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
-		    pte_t *ptep, int prot)
+		    pte_t *ptep, int prot, unsigned long bit)
 {
 	pte_t entry;
 	pgste_t pgste;
@@ -441,7 +445,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
 		pgste_set_unlock(ptep, pgste);
 		return -EAGAIN;
 	}
-	/* Change access rights and set the pgste notification bit */
+	/* Change access rights and set pgste bit */
 	if (prot == PROT_NONE && !pte_i) {
 		ptep_flush_direct(mm, addr, ptep);
 		pgste = pgste_update_all(entry, pgste, mm);
@@ -452,12 +456,53 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
 		pte_val(entry) &= ~_PAGE_INVALID;
 		pte_val(entry) |= _PAGE_PROTECT;
 	}
-	pgste_val(pgste) |= PGSTE_IN_BIT;
+	pgste_val(pgste) |= bit;
 	pgste = pgste_set_pte(ptep, pgste, entry);
 	pgste_set_unlock(ptep, pgste);
 	return 0;
 }
 
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+		    pte_t *sptep, pte_t *tptep, int write)
+{
+	pgste_t spgste, tpgste;
+	pte_t spte, tpte;
+	int rc = -EAGAIN;
+
+	spgste = pgste_get_lock(sptep);
+	spte = *sptep;
+	if (!(pte_val(spte) & _PAGE_INVALID) &&
+	    !(pte_val(spte) & _PAGE_PROTECT)) {
+		rc = 0;
+		if (!(pte_val(*tptep) & _PAGE_INVALID))
+			/* Update existing mapping */
+			ptep_flush_direct(mm, saddr, tptep);
+		else
+			rc = 1;
+		pgste_val(spgste) |= PGSTE_VSIE_BIT;
+		tpgste = pgste_get_lock(tptep);
+		pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+			(write ? 0 : _PAGE_PROTECT);
+		/* don't touch the storage key - it belongs to parent pgste */
+		tpgste = pgste_set_pte(tptep, tpgste, tpte);
+		pgste_set_unlock(tptep, tpgste);
+	}
+	pgste_set_unlock(sptep, spgste);
+	return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+	pgste_t pgste;
+
+	pgste = pgste_get_lock(ptep);
+	/* notifier is called by the caller */
+	ptep_flush_direct(mm, saddr, ptep);
+	/* don't touch the storage key - it belongs to parent pgste */
+	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+	pgste_set_unlock(ptep, pgste);
+}
+
 static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
 	if (!non_swap_entry(entry))

From aa17aa57cfb95b169f25fe98caae49e477590af3 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 12:16:35 +0100
Subject: [PATCH 121/302] s390/mm: add kvm shadow fault function

This patch introduces function kvm_s390_shadow_fault() used to resolve a
fault on a shadow gmap. This function will do validity checking and
build up the shadow page table hierarchy in order to fault in the
requested page into the shadow page table structure.

If an exception occurs while shadowing, guest 2 has to be notified about
it using either an exception or a program interrupt intercept. If
concurrent unshadowing occurs, this function will simply return with
-EAGAIN and the caller has to retry.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 168 ++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/gaccess.h |   2 +
 2 files changed, 170 insertions(+)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8e245e764c210a..ba4985262bced3 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/err.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include <asm/switch_to.h>
@@ -946,3 +947,170 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 		return 0;
 	return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
 }
+
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+				  unsigned long *pgt, int *dat_protection)
+{
+	struct gmap *parent;
+	union asce asce;
+	union vaddress vaddr;
+	unsigned long ptr;
+	int rc;
+
+	parent = sg->parent;
+	vaddr.addr = saddr;
+	asce.val = sg->orig_asce;
+	ptr = asce.origin * 4096;
+	switch (asce.dt) {
+	case ASCE_TYPE_REGION1:
+		if (vaddr.rfx01 > asce.tl)
+			return PGM_REGION_FIRST_TRANS;
+		break;
+	case ASCE_TYPE_REGION2:
+		if (vaddr.rfx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.rsx01 > asce.tl)
+			return PGM_REGION_SECOND_TRANS;
+		break;
+	case ASCE_TYPE_REGION3:
+		if (vaddr.rfx || vaddr.rsx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.rtx01 > asce.tl)
+			return PGM_REGION_THIRD_TRANS;
+		break;
+	case ASCE_TYPE_SEGMENT:
+		if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.sx01 > asce.tl)
+			return PGM_SEGMENT_TRANSLATION;
+		break;
+	}
+
+	switch (asce.dt) {
+	case ASCE_TYPE_REGION1: {
+		union region1_table_entry rfte;
+
+		rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+		if (rc)
+			return rc;
+		if (rfte.i)
+			return PGM_REGION_FIRST_TRANS;
+		if (rfte.tt != TABLE_TYPE_REGION1)
+			return PGM_TRANSLATION_SPEC;
+		if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+			return PGM_REGION_SECOND_TRANS;
+		rc = gmap_shadow_r2t(sg, saddr, rfte.val);
+		if (rc)
+			return rc;
+		ptr = rfte.rto * 4096;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_REGION2: {
+		union region2_table_entry rste;
+
+		rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+		if (rc)
+			return rc;
+		if (rste.i)
+			return PGM_REGION_SECOND_TRANS;
+		if (rste.tt != TABLE_TYPE_REGION2)
+			return PGM_TRANSLATION_SPEC;
+		if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+			return PGM_REGION_THIRD_TRANS;
+		rc = gmap_shadow_r3t(sg, saddr, rste.val);
+		if (rc)
+			return rc;
+		ptr = rste.rto * 4096;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_REGION3: {
+		union region3_table_entry rtte;
+
+		rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+		if (rc)
+			return rc;
+		if (rtte.i)
+			return PGM_REGION_THIRD_TRANS;
+		if (rtte.tt != TABLE_TYPE_REGION3)
+			return PGM_TRANSLATION_SPEC;
+		if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+			return PGM_SEGMENT_TRANSLATION;
+		rc = gmap_shadow_sgt(sg, saddr, rtte.val);
+		if (rc)
+			return rc;
+		ptr = rtte.fc0.sto * 4096;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_SEGMENT: {
+		union segment_table_entry ste;
+
+		rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+		if (rc)
+			return rc;
+		if (ste.i)
+			return PGM_SEGMENT_TRANSLATION;
+		if (ste.tt != TABLE_TYPE_SEGMENT)
+			return PGM_TRANSLATION_SPEC;
+		if (ste.cs && asce.p)
+			return PGM_TRANSLATION_SPEC;
+		*dat_protection = ste.fc0.p;
+		rc = gmap_shadow_pgt(sg, saddr, ste.val);
+		if (rc)
+			return rc;
+		ptr = ste.fc0.pto * 2048;
+	}
+	}
+	/* Return the parent address of the page table */
+	*pgt = ptr;
+	return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @write: =1 map r/w, =0 map r/o
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ *	    - > 0 (pgm exception code) on exceptions while faulting
+ *	    - -EAGAIN if the caller can retry immediately
+ *	    - -EFAULT when accessing invalid guest addresses
+ *	    - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write)
+{
+	union vaddress vaddr;
+	union page_table_entry pte;
+	unsigned long pgt;
+	int dat_protection;
+	int rc;
+
+	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection);
+	if (rc) {
+		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection);
+		if (rc)
+			return rc;
+	}
+
+	vaddr.addr = saddr;
+	rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+	if (rc)
+		return rc;
+	if (pte.i)
+		return PGM_PAGE_TRANSLATION;
+	if (pte.z || pte.co)
+		return PGM_TRANSLATION_SPEC;
+	dat_protection |= pte.p;
+	if (write && dat_protection)
+		return PGM_PROTECTION;
+	rc = gmap_shadow_page(sg, saddr, pte.pfra * 4096, write);
+	if (rc)
+		return rc;
+	return 0;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd81595f..e5ec4734d42da1 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,6 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr, int write);
+
 #endif /* __KVM_S390_GACCESS_H */

From eea3678d4334925bf838e6f4bc88760811a84cd6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 15 Apr 2016 12:45:45 +0200
Subject: [PATCH 122/302] s390/mm: flush tlb of shadows in all situations

For now, the tlb of a shadow gmap is only flushed when the parent is
removed, not when the shadow gmap itself is removed upfront. Therefore
other shadow gmaps can reuse the tables without the tlb getting flushed.

Fix this by simply flushing the tlb
1. Before the shadow tables are removed (analogous to other unshadow functions)
2. When the gmap is freed and therefore the top level pages are freed.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6695a09a3885c5..b02d0d0cc64136 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -173,6 +173,9 @@ static void gmap_free(struct gmap *gmap)
 {
 	struct page *page, *next;
 
+	/* Flush tlb of all gmaps (if not already done for shadows) */
+	if (!(gmap_is_shadow(gmap) && gmap->removed))
+		gmap_flush_tlb(gmap);
 	/* Free all segment & region tables. */
 	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 		__free_pages(page, 2);
@@ -226,13 +229,10 @@ void gmap_remove(struct gmap *gmap)
 {
 	struct gmap *sg, *next;
 
-	/* Flush tlb. */
-	gmap_flush_tlb(gmap);
 	/* Remove all shadow gmaps linked to this gmap */
 	if (!list_empty(&gmap->children)) {
 		spin_lock(&gmap->shadow_lock);
 		list_for_each_entry_safe(sg, next, &gmap->children, list) {
-			gmap_flush_tlb(sg);
 			list_del(&sg->list);
 			gmap_put(sg);
 		}
@@ -1360,6 +1360,7 @@ static void gmap_unshadow(struct gmap *sg)
 		return;
 	sg->removed = 1;
 	gmap_call_notifier(sg, 0, -1UL);
+	gmap_flush_tlb(sg);
 	table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
 	switch (sg->asce & _ASCE_TYPE_MASK) {
 	case _ASCE_TYPE_REGION1:

From a9d23e71d7716e394a772686bfd994f4e181b235 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 12:21:41 +0100
Subject: [PATCH 123/302] s390/mm: shadow pages with real guest requested
 protection

We really want to avoid manually handling protection for nested
virtualization. By shadowing pages with the protection the guest asked us
for, the SIE can handle most protection-related actions for us (e.g.
special handling for MVPG) and we can directly forward protection
exceptions to the guest.

PTEs will now always be shadowed with the correct _PAGE_PROTECT flag.
Unshadowing will take care of any guest changes to the parent PTE and
any host changes to the host PTE. If the host PTE doesn't have the
fitting access rights or is not available, we have to fix it up.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h    |  3 +--
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/gaccess.c         |  2 +-
 arch/s390/mm/gmap.c             | 12 +++++-------
 arch/s390/mm/pgtable.c          | 16 +++++++---------
 5 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 58e65ee5b2d25e..4a47055f58d766 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -110,8 +110,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt);
 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
 			   unsigned long *pgt, int *dat_protection);
-int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
-		     unsigned long paddr, int write);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
 
 void gmap_register_pte_notifier(struct gmap_notifier *);
 void gmap_unregister_pte_notifier(struct gmap_notifier *);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index a6e7fc8f5b495c..c7ebba483f097f 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -895,7 +895,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep , int reset);
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
-		    pte_t *sptep, pte_t *tptep, int write);
+		    pte_t *sptep, pte_t *tptep, pte_t pte);
 void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
 
 bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index ba4985262bced3..c5f79c1205cf8f 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1109,7 +1109,7 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write)
 	dat_protection |= pte.p;
 	if (write && dat_protection)
 		return PGM_PROTECTION;
-	rc = gmap_shadow_page(sg, saddr, pte.pfra * 4096, write);
+	rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
 	if (rc)
 		return rc;
 	return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index b02d0d0cc64136..a57a87bfeb2784 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1743,8 +1743,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
  * gmap_shadow_page - create a shadow page mapping
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
- * @paddr: parent gmap address to get mapped at @saddr
- * @write: =1 map r/w, =0 map r/o
+ * @pte: pte in parent gmap address space to get shadowed
  *
  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  * shadow table structure is incomplete, -ENOMEM if out of memory and
@@ -1752,12 +1751,11 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
  *
  * Called with sg->mm->mmap_sem in read.
  */
-int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
-		     unsigned long paddr, int write)
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
 {
 	struct gmap *parent;
 	struct gmap_rmap *rmap;
-	unsigned long vmaddr;
+	unsigned long vmaddr, paddr;
 	spinlock_t *ptl;
 	pte_t *sptep, *tptep;
 	int rc;
@@ -1771,6 +1769,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
 	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
 
 	while (1) {
+		paddr = pte_val(pte) & PAGE_MASK;
 		vmaddr = __gmap_translate(parent, paddr);
 		if (IS_ERR_VALUE(vmaddr)) {
 			rc = vmaddr;
@@ -1791,8 +1790,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
 				radix_tree_preload_end();
 				break;
 			}
-			rc = ptep_shadow_pte(sg->mm, saddr,
-					     sptep, tptep, write);
+			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
 			if (rc > 0) {
 				/* Success and a new mapping */
 				gmap_insert_rmap(sg, vmaddr, rmap);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5b02583fbf4cbc..293130b5aee763 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -463,29 +463,27 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
 }
 
 int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
-		    pte_t *sptep, pte_t *tptep, int write)
+		    pte_t *sptep, pte_t *tptep, pte_t pte)
 {
 	pgste_t spgste, tpgste;
 	pte_t spte, tpte;
 	int rc = -EAGAIN;
 
+	if (!(pte_val(*tptep) & _PAGE_INVALID))
+		return 0;	/* already shadowed */
 	spgste = pgste_get_lock(sptep);
 	spte = *sptep;
 	if (!(pte_val(spte) & _PAGE_INVALID) &&
-	    !(pte_val(spte) & _PAGE_PROTECT)) {
-		rc = 0;
-		if (!(pte_val(*tptep) & _PAGE_INVALID))
-			/* Update existing mapping */
-			ptep_flush_direct(mm, saddr, tptep);
-		else
-			rc = 1;
+	    !((pte_val(spte) & _PAGE_PROTECT) &&
+	      !(pte_val(pte) & _PAGE_PROTECT))) {
 		pgste_val(spgste) |= PGSTE_VSIE_BIT;
 		tpgste = pgste_get_lock(tptep);
 		pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
-			(write ? 0 : _PAGE_PROTECT);
+				(pte_val(pte) & _PAGE_PROTECT);
 		/* don't touch the storage key - it belongs to parent pgste */
 		tpgste = pgste_set_pte(tptep, tpgste, tpte);
 		pgste_set_unlock(tptep, tpgste);
+		rc = 1;
 	}
 	pgste_set_unlock(sptep, spgste);
 	return rc;

From 998f637cc4b9ef3fa32b196294a3136ee05271a2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 12:23:38 +0100
Subject: [PATCH 124/302] s390/mm: avoid races on region/segment/page table
 shadowing

We have to unlock sg->guest_table_lock in order to call
gmap_protect_rmap(). If we sleep just before that call, another VCPU
might pick up that shadowed page table (while it is not protected yet)
and use it.

In order to avoid these races, we have to introduce a third state -
"origin set but still invalid" for an entry. This way, we can avoid
another thread already using the entry before the table is fully protected.
As soon as everything is set up, we can clear the invalid bit - if we
had no race with the unshadowing code.

Suggested-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 97 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 70 insertions(+), 27 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index a57a87bfeb2784..a396e58b5a4358 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1125,7 +1125,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
 
 	BUG_ON(!gmap_is_shadow(sg));
 	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
-	if (!ste || *ste & _SEGMENT_ENTRY_INVALID)
+	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
 		return;
 	gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
 	sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
@@ -1157,7 +1157,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
 	BUG_ON(!gmap_is_shadow(sg));
 	asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
 	for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
-		if (sgt[i] & _SEGMENT_ENTRY_INVALID)
+		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
 			continue;
 		pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
 		sgt[i] = _SEGMENT_ENTRY_EMPTY;
@@ -1183,7 +1183,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
 
 	BUG_ON(!gmap_is_shadow(sg));
 	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
-	if (!r3e || *r3e & _REGION_ENTRY_INVALID)
+	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
 		return;
 	gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
 	r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
@@ -1215,7 +1215,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
 	BUG_ON(!gmap_is_shadow(sg));
 	asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
 	for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
-		if (r3t[i] & _REGION_ENTRY_INVALID)
+		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
 			continue;
 		sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
 		r3t[i] = _REGION3_ENTRY_EMPTY;
@@ -1241,7 +1241,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
 
 	BUG_ON(!gmap_is_shadow(sg));
 	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
-	if (!r2e || *r2e & _REGION_ENTRY_INVALID)
+	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
 		return;
 	gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
 	r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
@@ -1273,7 +1273,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
 	BUG_ON(!gmap_is_shadow(sg));
 	asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
 	for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
-		if (r2t[i] & _REGION_ENTRY_INVALID)
+		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
 			continue;
 		r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
 		r2t[i] = _REGION2_ENTRY_EMPTY;
@@ -1299,7 +1299,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
 
 	BUG_ON(!gmap_is_shadow(sg));
 	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
-	if (!r1e || *r1e & _REGION_ENTRY_INVALID)
+	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
 		return;
 	gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
 	r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
@@ -1331,7 +1331,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
 	BUG_ON(!gmap_is_shadow(sg));
 	asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
 	for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
-		if (r1t[i] & _REGION_ENTRY_INVALID)
+		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
 			continue;
 		r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
 		__gmap_unshadow_r2t(sg, raddr, r2t);
@@ -1496,10 +1496,14 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
 	if (!(*table & _REGION_ENTRY_INVALID)) {
 		rc = 0;			/* Already established */
 		goto out_free;
+	} else if (*table & _REGION_ENTRY_ORIGIN) {
+		rc = -EAGAIN;		/* Race with shadow */
+		goto out_free;
 	}
 	crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
-	*table = (unsigned long) s_r2t |
-		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1;
+	/* mark as invalid as long as the parent table is not protected */
+	*table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r2t read-only in parent gmap page table */
@@ -1508,11 +1512,18 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
 	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
 	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
-	if (rc) {
-		spin_lock(&sg->guest_table_lock);
+	spin_lock(&sg->guest_table_lock);
+	if (!rc) {
+		table = gmap_table_walk(sg, saddr, 4);
+		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+			      (unsigned long) s_r2t)
+			rc = -EAGAIN;		/* Race with unshadow */
+		else
+			*table &= ~_REGION_ENTRY_INVALID;
+	} else {
 		gmap_unshadow_r2t(sg, raddr);
-		spin_unlock(&sg->guest_table_lock);
 	}
+	spin_unlock(&sg->guest_table_lock);
 	return rc;
 out_free:
 	spin_unlock(&sg->guest_table_lock);
@@ -1557,10 +1568,14 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
 	if (!(*table & _REGION_ENTRY_INVALID)) {
 		rc = 0;			/* Already established */
 		goto out_free;
+	} else if (*table & _REGION_ENTRY_ORIGIN) {
+		rc = -EAGAIN;		/* Race with shadow */
+		goto out_free;
 	}
 	crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
-	*table = (unsigned long) s_r3t |
-		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2;
+	/* mark as invalid as long as the parent table is not protected */
+	*table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r3t read-only in parent gmap page table */
@@ -1569,11 +1583,18 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
 	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
 	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
-	if (rc) {
-		spin_lock(&sg->guest_table_lock);
+	spin_lock(&sg->guest_table_lock);
+	if (!rc) {
+		table = gmap_table_walk(sg, saddr, 3);
+		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+			      (unsigned long) s_r3t)
+			rc = -EAGAIN;		/* Race with unshadow */
+		else
+			*table &= ~_REGION_ENTRY_INVALID;
+	} else {
 		gmap_unshadow_r3t(sg, raddr);
-		spin_unlock(&sg->guest_table_lock);
 	}
+	spin_unlock(&sg->guest_table_lock);
 	return rc;
 out_free:
 	spin_unlock(&sg->guest_table_lock);
@@ -1618,10 +1639,14 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
 	if (!(*table & _REGION_ENTRY_INVALID)) {
 		rc = 0;			/* Already established */
 		goto out_free;
+	} else if (*table & _REGION_ENTRY_ORIGIN) {
+		rc = -EAGAIN;		/* Race with shadow */
+		goto out_free;
 	}
 	crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
-	*table = (unsigned long) s_sgt |
-		_REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3;
+	/* mark as invalid as long as the parent table is not protected */
+	*table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make sgt read-only in parent gmap page table */
@@ -1630,11 +1655,18 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
 	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
 	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
 	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
-	if (rc) {
-		spin_lock(&sg->guest_table_lock);
+	spin_lock(&sg->guest_table_lock);
+	if (!rc) {
+		table = gmap_table_walk(sg, saddr, 2);
+		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+			      (unsigned long) s_sgt)
+			rc = -EAGAIN;		/* Race with unshadow */
+		else
+			*table &= ~_REGION_ENTRY_INVALID;
+	} else {
 		gmap_unshadow_sgt(sg, raddr);
-		spin_unlock(&sg->guest_table_lock);
 	}
+	spin_unlock(&sg->guest_table_lock);
 	return rc;
 out_free:
 	spin_unlock(&sg->guest_table_lock);
@@ -1716,20 +1748,31 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt)
 	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
 		rc = 0;			/* Already established */
 		goto out_free;
+	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+		rc = -EAGAIN;		/* Race with shadow */
+		goto out_free;
 	}
+	/* mark as invalid as long as the parent table is not protected */
 	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
-		 (pgt & _SEGMENT_ENTRY_PROTECT);
+		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
 	list_add(&page->lru, &sg->pt_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make pgt read-only in parent gmap page table (not the pgste) */
 	raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
 	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
 	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
-	if (rc) {
-		spin_lock(&sg->guest_table_lock);
+	spin_lock(&sg->guest_table_lock);
+	if (!rc) {
+		table = gmap_table_walk(sg, saddr, 1);
+		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+			      (unsigned long) s_pgt)
+			rc = -EAGAIN;		/* Race with unshadow */
+		else
+			*table &= ~_SEGMENT_ENTRY_INVALID;
+	} else {
 		gmap_unshadow_pgt(sg, raddr);
-		spin_unlock(&sg->guest_table_lock);
 	}
+	spin_unlock(&sg->guest_table_lock);
 	return rc;
 out_free:
 	spin_unlock(&sg->guest_table_lock);

From 0f7f84891516dc1ff7500fae12143710d2d9d11f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 12:30:46 +0100
Subject: [PATCH 125/302] s390/mm: fix races on gmap_shadow creation

Before any thread is allowed to use a gmap_shadow, it has to be fully
initialized. However, for invalidation to work properly, we have to
register the new gmap_shadow before we protect the parent gmap table.

Because locking is tricky, and we have to avoid duplicate gmaps, let's
introduce an initialized field that signals to other threads whether the
gmap_shadow can already be used or whether they have to retry.

Let's properly return errors using ERR_PTR() instead of simply returning
NULL, so a caller can properly react to the error.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  2 ++
 arch/s390/mm/gmap.c          | 45 ++++++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 4a47055f58d766..54a2487efce4f7 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -27,6 +27,7 @@
  * @parent: pointer to the parent gmap for shadow guest address spaces
  * @orig_asce: ASCE for which the shadow page table has been created
  * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
  */
 struct gmap {
 	struct list_head list;
@@ -49,6 +50,7 @@ struct gmap {
 	struct gmap *parent;
 	unsigned long orig_asce;
 	bool removed;
+	bool initialized;
 };
 
 /**
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index a396e58b5a4358..a7dfb337e13380 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1384,7 +1384,8 @@ static void gmap_unshadow(struct gmap *sg)
  * @asce: ASCE for which the shadow table is created
  *
  * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, otherwise NULL
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
  */
 static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
 {
@@ -1393,6 +1394,8 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
 	list_for_each_entry(sg, &parent->children, list) {
 		if (sg->orig_asce != asce || sg->removed)
 			continue;
+		if (!sg->initialized)
+			return ERR_PTR(-EAGAIN);
 		atomic_inc(&sg->ref_count);
 		return sg;
 	}
@@ -1409,8 +1412,9 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
  * The shadow table will be removed automatically on any change to the
  * PTE mapping for the source table.
  *
- * Returns a guest address space structure, NULL if out of memory or if
- * anything goes wrong while protecting the top level pages.
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
  */
 struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
 {
@@ -1428,30 +1432,37 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
 	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
 	new = gmap_alloc(limit);
 	if (!new)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	new->mm = parent->mm;
 	new->parent = gmap_get(parent);
 	new->orig_asce = asce;
+	new->initialized = false;
+	spin_lock(&parent->shadow_lock);
+	/* Recheck if another CPU created the same shadow */
+	sg = gmap_find_shadow(parent, asce);
+	if (sg) {
+		spin_unlock(&parent->shadow_lock);
+		gmap_free(new);
+		return sg;
+	}
+	atomic_set(&new->ref_count, 2);
+	list_add(&new->list, &parent->children);
+	spin_unlock(&parent->shadow_lock);
+	/* protect after insertion, so it will get properly invalidated */
 	down_read(&parent->mm->mmap_sem);
 	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
 				((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
 				PROT_READ, PGSTE_VSIE_BIT);
 	up_read(&parent->mm->mmap_sem);
+	spin_lock(&parent->shadow_lock);
+	new->initialized = true;
 	if (rc) {
-		atomic_set(&new->ref_count, 2);
-		spin_lock(&parent->shadow_lock);
-		/* Recheck if another CPU created the same shadow */
-		sg = gmap_find_shadow(parent, asce);
-		if (!sg) {
-			list_add(&new->list, &parent->children);
-			sg = new;
-			new = NULL;
-		}
-		spin_unlock(&parent->shadow_lock);
-	}
-	if (new)
+		list_del(&new->list);
 		gmap_free(new);
-	return sg;
+		new = ERR_PTR(rc);
+	}
+	spin_unlock(&parent->shadow_lock);
+	return new;
 }
 EXPORT_SYMBOL_GPL(gmap_shadow);
 

From e52f8b6112353e9e8eac64f082bfbc65e64bb2dd Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 2 Feb 2016 12:26:00 +0100
Subject: [PATCH 126/302] s390/mm: take the mmap_sem in kvm_s390_shadow_fault()

Instead of doing it in the caller, let's just take the mmap_sem
in kvm_s390_shadow_fault(). By taking it for reading, we allow parallel
faulting on shadow page tables; the gmap shadow code is prepared for that.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index c5f79c1205cf8f..5b5eee2d51cd59 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1091,26 +1091,24 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write)
 	int dat_protection;
 	int rc;
 
+	down_read(&sg->mm->mmap_sem);
+
 	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection);
-	if (rc) {
+	if (rc)
 		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection);
-		if (rc)
-			return rc;
-	}
 
 	vaddr.addr = saddr;
-	rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
-	if (rc)
-		return rc;
-	if (pte.i)
-		return PGM_PAGE_TRANSLATION;
-	if (pte.z || pte.co)
-		return PGM_TRANSLATION_SPEC;
+	if (!rc)
+		rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+	if (!rc && pte.i)
+		rc = PGM_PAGE_TRANSLATION;
+	if (!rc && (pte.z || pte.co))
+		rc = PGM_TRANSLATION_SPEC;
 	dat_protection |= pte.p;
-	if (write && dat_protection)
-		return PGM_PROTECTION;
-	rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
-	if (rc)
-		return rc;
-	return 0;
+	if (!rc && write && dat_protection)
+		rc = PGM_PROTECTION;
+	if (!rc)
+		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+	up_read(&sg->mm->mmap_sem);
+	return rc;
 }

From 7a6741576b268820c8bd2b66288e6ff3bc57d4a7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 27 Jan 2016 17:18:41 +0100
Subject: [PATCH 127/302] s390/mm: protection exceptions are correctly
 shadowed

As gmap shadows contain correct protection permissions, protection
exceptions can directly be forwarded to guest 3. If we encounter
a protection exception while faulting, the next guest 3 run will
automatically handle that for us.

Keep the dat_protection logic in place, as it will be helpful later.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 6 +-----
 arch/s390/kvm/gaccess.h | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 5b5eee2d51cd59..b2783dd71854b5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1075,7 +1075,6 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
  * kvm_s390_shadow_fault - handle fault on a shadow page table
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
- * @write: =1 map r/w, =0 map r/o
  *
  * Returns: - 0 if the shadow fault was successfully resolved
  *	    - > 0 (pgm exception code) on exceptions while faulting
@@ -1083,7 +1082,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
  *	    - -EFAULT when accessing invalid guest addresses
  *	    - -ENOMEM if out of memory
  */
-int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write)
+int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr)
 {
 	union vaddress vaddr;
 	union page_table_entry pte;
@@ -1104,9 +1103,6 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write)
 		rc = PGM_PAGE_TRANSLATION;
 	if (!rc && (pte.z || pte.co))
 		rc = PGM_TRANSLATION_SPEC;
-	dat_protection |= pte.p;
-	if (!rc && write && dat_protection)
-		rc = PGM_PROTECTION;
 	if (!rc)
 		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
 	up_read(&sg->mm->mmap_sem);
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index e5ec4734d42da1..0d044d09dbd80a 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,6 +361,6 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
-int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr, int write);
+int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr);
 
 #endif /* __KVM_S390_GACCESS_H */

From f4debb40903978bbddfb9e877ca4d2f27e26567f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 27 Jan 2016 17:24:03 +0100
Subject: [PATCH 128/302] s390/mm: take ipte_lock during shadow faults

Let's take the ipte_lock while working on guest-2-provided page tables,
just like the other gaccess functions.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 11 ++++++++++-
 arch/s390/kvm/gaccess.h |  3 ++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index b2783dd71854b5..e70f916c107961 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1073,6 +1073,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 
 /**
  * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  *
@@ -1082,7 +1083,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
  *	    - -EFAULT when accessing invalid guest addresses
  *	    - -ENOMEM if out of memory
  */
-int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr)
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+			  unsigned long saddr)
 {
 	union vaddress vaddr;
 	union page_table_entry pte;
@@ -1091,6 +1093,12 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr)
 	int rc;
 
 	down_read(&sg->mm->mmap_sem);
+	/*
+	 * We don't want any guest-2 tables to change - so the parent
+	 * tables/pointers we read stay valid - unshadowing is however
+	 * always possible - only guest_table_lock protects us.
+	 */
+	ipte_lock(vcpu);
 
 	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection);
 	if (rc)
@@ -1105,6 +1113,7 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr)
 		rc = PGM_TRANSLATION_SPEC;
 	if (!rc)
 		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+	ipte_unlock(vcpu);
 	up_read(&sg->mm->mmap_sem);
 	return rc;
 }
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 0d044d09dbd80a..8756569ad9381f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,6 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
-int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr);
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+			  unsigned long saddr);
 
 #endif /* __KVM_S390_GACCESS_H */

From 00fc062d5364174b94e3b5780c22e95c0fb4b60a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Apr 2016 17:19:59 +0200
Subject: [PATCH 129/302] s390/mm: push ste protection down to shadow pte

If a guest ste is read-only, it doesn't make sense to force the ptes in as
writable in the host. If the source page is read-only in the host, it won't
have to be made writable. Please note that if the source page is not
available, it will still be faulted in writable. This can be changed
internally later on.

If ste protection is removed, underlying shadow tables are also removed,
therefore this change does not affect the guest.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e70f916c107961..a85bc6c6a09834 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1111,6 +1111,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
 		rc = PGM_PAGE_TRANSLATION;
 	if (!rc && (pte.z || pte.co))
 		rc = PGM_TRANSLATION_SPEC;
+	pte.p |= dat_protection;
 	if (!rc)
 		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
 	ipte_unlock(vcpu);

From 5b062bd4940f81e0bd26b0d75f56d7abebf0309f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 12:17:40 +0100
Subject: [PATCH 130/302] s390/mm: prepare for EDAT1/EDAT2 support in gmap
 shadow

In preparation for EDAT1/EDAT2 support for gmap shadows, we have to store
the requested edat level in the gmap shadow.

The edat level used during shadow translation is a property of the gmap
shadow. Depending on that level, the gmap shadow will look differently for
the same guest tables. We have to store it internally in order to support
it later.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  5 ++++-
 arch/s390/mm/gmap.c          | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 54a2487efce4f7..2ab397c8ca092b 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -26,6 +26,7 @@
  * @shadow_lock: spinlock to protect the shadow gmap list
  * @parent: pointer to the parent gmap for shadow guest address spaces
  * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
  * @removed: flag to indicate if a shadow guest address space has been removed
  * @initialized: flag to indicate if a shadow guest address space can be used
  */
@@ -49,6 +50,7 @@ struct gmap {
 	spinlock_t shadow_lock;
 	struct gmap *parent;
 	unsigned long orig_asce;
+	int edat_level;
 	bool removed;
 	bool initialized;
 };
@@ -105,7 +107,8 @@ void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr)
 
 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
 
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce);
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+			 int edat_level);
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index a7dfb337e13380..f0b2a531c5999b 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1382,17 +1382,20 @@ static void gmap_unshadow(struct gmap *sg)
  * gmap_find_shadow - find a specific asce in the list of shadow tables
  * @parent: pointer to the parent gmap
  * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
  *
  * Returns the pointer to a gmap if a shadow table with the given asce is
  * already available, ERR_PTR(-EAGAIN) if another one is just being created,
  * otherwise NULL
  */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+				     int edat_level)
 {
 	struct gmap *sg;
 
 	list_for_each_entry(sg, &parent->children, list) {
-		if (sg->orig_asce != asce || sg->removed)
+		if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+		    sg->removed)
 			continue;
 		if (!sg->initialized)
 			return ERR_PTR(-EAGAIN);
@@ -1406,6 +1409,7 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
  * gmap_shadow - create/find a shadow guest address space
  * @parent: pointer to the parent gmap
  * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
  *
  * The pages of the top level page table referred by the asce parameter
  * will be set to read-only and marked in the PGSTEs of the kvm process.
@@ -1416,7 +1420,8 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
  * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
  * parent gmap table could not be protected.
  */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+			 int edat_level)
 {
 	struct gmap *sg, *new;
 	unsigned long limit;
@@ -1424,7 +1429,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
 
 	BUG_ON(gmap_is_shadow(parent));
 	spin_lock(&parent->shadow_lock);
-	sg = gmap_find_shadow(parent, asce);
+	sg = gmap_find_shadow(parent, asce, edat_level);
 	spin_unlock(&parent->shadow_lock);
 	if (sg)
 		return sg;
@@ -1436,10 +1441,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
 	new->mm = parent->mm;
 	new->parent = gmap_get(parent);
 	new->orig_asce = asce;
+	new->edat_level = edat_level;
 	new->initialized = false;
 	spin_lock(&parent->shadow_lock);
 	/* Recheck if another CPU created the same shadow */
-	sg = gmap_find_shadow(parent, asce);
+	sg = gmap_find_shadow(parent, asce, edat_level);
 	if (sg) {
 		spin_unlock(&parent->shadow_lock);
 		gmap_free(new);

From fd8d4e3ab6993e194287a59c4d3a6a43da86b8dc Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Apr 2016 13:24:52 +0200
Subject: [PATCH 131/302] s390/mm: support EDAT1 for gmap shadows

If the guest is enabled for EDAT1, we can easily create shadows for
guest2 -> guest3 provided tables that make use of EDAT1.

If guest2 references a 1MB page, this memory looks consecutive for guest2,
but it might not be so for us. Therefore we have to create fake page tables.

We can easily add that to our existing infrastructure. The invalidation
mechanism will make sure that fake page tables are removed when the parent
table (sgt table entry) is changed.

As EDAT1 also introduced protection on all page table levels, we have to
also shadow these correctly.

We don't have to care about:
- ACCF-Validity Control in STE
- Access-Control Bits in STE
- Fetch-Protection Bit in STE
- Common-Segment Bit in STE

As all bits might be dropped and there is no guarantee that they are
active ("unpredictable whether the CPU uses these bits", "may be used").
Without using EDAT1 in the shadow ourselves (STE-format control == 0),
simply shadowing these bits would not be enough. They would be ignored.

Please note that we are using the "fake" flag to make this look consistent
with further changes (EDAT2, real-space designation support) and don't let
the shadow functions handle fc=1 stes.

In the future, with huge pages in the host, gmap_shadow_pgt() could simply
try to map a huge host page if "fake" is set to one and indicate via return
value that no lower fake tables / shadow ptes are required.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  5 +++--
 arch/s390/kvm/gaccess.c      | 34 +++++++++++++++++++++++++++-------
 arch/s390/mm/gmap.c          | 29 +++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 2ab397c8ca092b..c8ba5a197b1d40 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -112,9 +112,10 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
-int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+		    int fake);
 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
-			   unsigned long *pgt, int *dat_protection);
+			   unsigned long *pgt, int *dat_protection, int *fake);
 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
 
 void gmap_register_pte_notifier(struct gmap_notifier *);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index a85bc6c6a09834..af1fc6fa7b74fc 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -953,9 +953,11 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
  */
 static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
-				  unsigned long *pgt, int *dat_protection)
+				  unsigned long *pgt, int *dat_protection,
+				  int *fake)
 {
 	struct gmap *parent;
 	union asce asce;
@@ -963,6 +965,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 	unsigned long ptr;
 	int rc;
 
+	*fake = 0;
 	parent = sg->parent;
 	vaddr.addr = saddr;
 	asce.val = sg->orig_asce;
@@ -1060,10 +1063,20 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 		if (ste.cs && asce.p)
 			return PGM_TRANSLATION_SPEC;
 		*dat_protection = ste.fc0.p;
-		rc = gmap_shadow_pgt(sg, saddr, ste.val);
+		if (ste.fc && sg->edat_level >= 1) {
+			bool prot = ste.fc1.p;
+
+			*fake = 1;
+			ptr = ste.fc1.sfaa << 20UL;
+			ste.val = ptr;
+			ste.fc0.p = prot;
+			goto shadow_pgt;
+		}
+		ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+		rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
 		if (rc)
 			return rc;
-		ptr = ste.fc0.pto * 2048;
 	}
 	}
 	/* Return the parent address of the page table */
@@ -1089,7 +1102,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
 	union vaddress vaddr;
 	union page_table_entry pte;
 	unsigned long pgt;
-	int dat_protection;
+	int dat_protection, fake;
 	int rc;
 
 	down_read(&sg->mm->mmap_sem);
@@ -1100,17 +1113,24 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
 	 */
 	ipte_lock(vcpu);
 
-	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection);
+	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
 	if (rc)
-		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection);
+		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+					    &fake);
 
 	vaddr.addr = saddr;
+	if (fake) {
+		/* offset in 1MB guest memory block */
+		pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+		goto shadow_page;
+	}
 	if (!rc)
 		rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
 	if (!rc && pte.i)
 		rc = PGM_PAGE_TRANSLATION;
-	if (!rc && (pte.z || pte.co))
+	if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
 		rc = PGM_TRANSLATION_SPEC;
+shadow_page:
 	pte.p |= dat_protection;
 	if (!rc)
 		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index f0b2a531c5999b..de7ad7bd4a4896 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,6 +20,8 @@
 #include <asm/gmap.h>
 #include <asm/tlb.h>
 
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
 /**
  * gmap_alloc - allocate and initialize a guest address space
  * @mm: pointer to the parent mm_struct
@@ -1521,6 +1523,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
 	/* mark as invalid as long as the parent table is not protected */
 	*table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
 		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+	if (sg->edat_level >= 1)
+		*table |= (r2t & _REGION_ENTRY_PROTECT);
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r2t read-only in parent gmap page table */
@@ -1592,6 +1596,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
 	/* mark as invalid as long as the parent table is not protected */
 	*table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
 		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+	if (sg->edat_level >= 1)
+		*table |= (r3t & _REGION_ENTRY_PROTECT);
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r3t read-only in parent gmap page table */
@@ -1664,6 +1670,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
 	/* mark as invalid as long as the parent table is not protected */
 	*table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
 		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+	if (sg->edat_level >= 1)
+		*table |= sgt & _REGION_ENTRY_PROTECT;
 	list_add(&page->lru, &sg->crst_list);
 	spin_unlock(&sg->guest_table_lock);
 	/* Make sgt read-only in parent gmap page table */
@@ -1698,6 +1706,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
  * @saddr: the address in the shadow aguest address space
  * @pgt: parent gmap address of the page table to get shadowed
  * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
  *
  * Returns 0 if the shadow page table was found and -EAGAIN if the page
  * table was not found.
@@ -1705,7 +1714,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
  * Called with sg->mm->mmap_sem in read.
  */
 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
-			   unsigned long *pgt, int *dat_protection)
+			   unsigned long *pgt, int *dat_protection,
+			   int *fake)
 {
 	unsigned long *table;
 	struct page *page;
@@ -1717,8 +1727,9 @@ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
 	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
 		/* Shadow page tables are full pages (pte+pgste) */
 		page = pfn_to_page(*table >> PAGE_SHIFT);
-		*pgt = page->index;
+		*pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
 		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+		*fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
 		rc = 0;
 	} else  {
 		rc = -EAGAIN;
@@ -1734,6 +1745,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
  *
  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  * shadow table structure is incomplete, -ENOMEM if out of memory,
@@ -1741,19 +1753,22 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
  *
  * Called with gmap->mm->mmap_sem in read
  */
-int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt)
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+		    int fake)
 {
 	unsigned long raddr, origin;
 	unsigned long *s_pgt, *table;
 	struct page *page;
 	int rc;
 
-	BUG_ON(!gmap_is_shadow(sg));
+	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
 	/* Allocate a shadow page table */
 	page = page_table_alloc_pgste(sg->mm);
 	if (!page)
 		return -ENOMEM;
 	page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+	if (fake)
+		page->index |= GMAP_SHADOW_FAKE_TABLE;
 	s_pgt = (unsigned long *) page_to_phys(page);
 	/* Install shadow page table */
 	spin_lock(&sg->guest_table_lock);
@@ -1773,6 +1788,12 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt)
 	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
 		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
 	list_add(&page->lru, &sg->pt_list);
+	if (fake) {
+		/* nothing to protect for fake tables */
+		*table &= ~_SEGMENT_ENTRY_INVALID;
+		spin_unlock(&sg->guest_table_lock);
+		return 0;
+	}
 	spin_unlock(&sg->guest_table_lock);
 	/* Make pgt read-only in parent gmap page table (not the pgste) */
 	raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;

From 18b89809881834cecd2977e6048a30c4c8f140fe Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Apr 2016 13:42:05 +0200
Subject: [PATCH 132/302] s390/mm: support EDAT2 for gmap shadows

If the guest is enabled for EDAT2, we can easily create shadows for
guest2 -> guest3 provided tables that make use of EDAT2.

If guest2 references a 2GB page, this memory looks consecutive for guest2,
but it does not have to be so for us. Therefore we have to create fake
segment and page tables.

This works just like EDAT1 support, so page tables are removed when the
parent table (r3t table entry) is changed.

We don't have to care about:
- ACCF-Validity Control in RTTE
- Access-Control Bits in RTTE
- Fetch-Protection Bit in RTTE
- Common-Region Bit in RTTE

Just like for EDAT1, all bits might be dropped and there is no guarantee
that they are active.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  3 ++-
 arch/s390/kvm/gaccess.c      | 22 ++++++++++++++++++++--
 arch/s390/mm/gmap.c          | 14 ++++++++++++--
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index c8ba5a197b1d40..2e4c3b222a9625 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -111,7 +111,8 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 			 int edat_level);
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
-int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+		    int fake);
 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
 		    int fake);
 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index af1fc6fa7b74fc..fab03ecb5bd544 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1042,17 +1042,35 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_REGION_THIRD_TRANS;
 		if (rtte.tt != TABLE_TYPE_REGION3)
 			return PGM_TRANSLATION_SPEC;
+		if (rtte.cr && asce.p && sg->edat_level >= 2)
+			return PGM_TRANSLATION_SPEC;
+		if (rtte.fc && sg->edat_level >= 2) {
+			bool prot = rtte.fc1.p;
+
+			*fake = 1;
+			ptr = rtte.fc1.rfaa << 31UL;
+			rtte.val = ptr;
+			rtte.fc0.p = prot;
+			goto shadow_sgt;
+		}
 		if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
 			return PGM_SEGMENT_TRANSLATION;
-		rc = gmap_shadow_sgt(sg, saddr, rtte.val);
+		ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+		rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
 		if (rc)
 			return rc;
-		ptr = rtte.fc0.sto * 4096;
 		/* fallthrough */
 	}
 	case ASCE_TYPE_SEGMENT: {
 		union segment_table_entry ste;
 
+		if (*fake) {
+			/* offset in 2G guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+			ste.val = ptr;
+			goto shadow_pgt;
+		}
 		rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
 		if (rc)
 			return rc;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index de7ad7bd4a4896..c96bf30245c0b3 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1631,6 +1631,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
  *
  * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
  * shadow table structure is incomplete, -ENOMEM if out of memory and
@@ -1638,19 +1639,22 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
  *
  * Called with sg->mm->mmap_sem in read.
  */
-int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+		    int fake)
 {
 	unsigned long raddr, origin, offset, len;
 	unsigned long *s_sgt, *table;
 	struct page *page;
 	int rc;
 
-	BUG_ON(!gmap_is_shadow(sg));
+	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
 	/* Allocate a shadow segment table */
 	page = alloc_pages(GFP_KERNEL, 2);
 	if (!page)
 		return -ENOMEM;
 	page->index = sgt & _REGION_ENTRY_ORIGIN;
+	if (fake)
+		page->index |= GMAP_SHADOW_FAKE_TABLE;
 	s_sgt = (unsigned long *) page_to_phys(page);
 	/* Install shadow region second table */
 	spin_lock(&sg->guest_table_lock);
@@ -1673,6 +1677,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
 	if (sg->edat_level >= 1)
 		*table |= sgt & _REGION_ENTRY_PROTECT;
 	list_add(&page->lru, &sg->crst_list);
+	if (fake) {
+		/* nothing to protect for fake tables */
+		*table &= ~_REGION_ENTRY_INVALID;
+		spin_unlock(&sg->guest_table_lock);
+		return 0;
+	}
 	spin_unlock(&sg->guest_table_lock);
 	/* Make sgt read-only in parent gmap page table */
 	raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;

From 1c65781b56ce812ce9729bf414201921c9408678 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Apr 2016 17:46:21 +0200
Subject: [PATCH 133/302] s390/mm: push rte protection down to shadow pte

Just like we already do with ste protection, let's take rte protection
into account. This way, the host pte doesn't have to be mapped writable.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/gaccess.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index fab03ecb5bd544..f6d556dfafcddb 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -966,6 +966,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 	int rc;
 
 	*fake = 0;
+	*dat_protection = 0;
 	parent = sg->parent;
 	vaddr.addr = saddr;
 	asce.val = sg->orig_asce;
@@ -1008,6 +1009,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_TRANSLATION_SPEC;
 		if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
 			return PGM_REGION_SECOND_TRANS;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rfte.p;
 		rc = gmap_shadow_r2t(sg, saddr, rfte.val);
 		if (rc)
 			return rc;
@@ -1026,6 +1029,9 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_TRANSLATION_SPEC;
 		if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
 			return PGM_REGION_THIRD_TRANS;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rste.p;
+		rste.p |= *dat_protection;
 		rc = gmap_shadow_r3t(sg, saddr, rste.val);
 		if (rc)
 			return rc;
@@ -1045,18 +1051,19 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 		if (rtte.cr && asce.p && sg->edat_level >= 2)
 			return PGM_TRANSLATION_SPEC;
 		if (rtte.fc && sg->edat_level >= 2) {
-			bool prot = rtte.fc1.p;
-
+			*dat_protection |= rtte.fc0.p;
 			*fake = 1;
 			ptr = rtte.fc1.rfaa << 31UL;
 			rtte.val = ptr;
-			rtte.fc0.p = prot;
 			goto shadow_sgt;
 		}
 		if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
 			return PGM_SEGMENT_TRANSLATION;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rtte.fc0.p;
 		ptr = rtte.fc0.sto << 12UL;
 shadow_sgt:
+		rtte.fc0.p |= *dat_protection;
 		rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
 		if (rc)
 			return rc;
@@ -1080,18 +1087,16 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_TRANSLATION_SPEC;
 		if (ste.cs && asce.p)
 			return PGM_TRANSLATION_SPEC;
-		*dat_protection = ste.fc0.p;
+		*dat_protection |= ste.fc0.p;
 		if (ste.fc && sg->edat_level >= 1) {
-			bool prot = ste.fc1.p;
-
 			*fake = 1;
 			ptr = ste.fc1.sfaa << 20UL;
 			ste.val = ptr;
-			ste.fc0.p = prot;
 			goto shadow_pgt;
 		}
 		ptr = ste.fc0.pto << 11UL;
 shadow_pgt:
+		ste.fc0.p |= *dat_protection;
 		rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
 		if (rc)
 			return rc;

From 3218f7094b6b583f4f01bffcf84572c6beacdcc2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Apr 2016 16:22:24 +0200
Subject: [PATCH 134/302] s390/mm: support real-space for gmap shadows

We can easily support real-space designation just like EDAT1 and EDAT2.
This allows guest 2 to provide guest 3 with an ASCE that has the real-space
control set.

We simply have to allocate the biggest page table possible and fake all
levels.

There is no protection to consider. If we exceed guest memory, vsie code
will inject an addressing exception (via program intercept). In the future,
we could limit the fake table level to the gmap page table.

As the top level page table can never go away, such gmap shadows will never
get unshadowed. We'll have to come up with another way to limit the number
of kept gmap shadows.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  6 ++++--
 arch/s390/kvm/gaccess.c      | 34 +++++++++++++++++++++++++++++-----
 arch/s390/mm/gmap.c          | 35 ++++++++++++++++++++++++++++++++---
 3 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 2e4c3b222a9625..752cf47a81abf3 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -109,8 +109,10 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
 
 struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 			 int edat_level);
-int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
-int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+		    int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+		    int fake);
 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
 		    int fake);
 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index f6d556dfafcddb..54200208bf24fe 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -971,9 +971,13 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 	vaddr.addr = saddr;
 	asce.val = sg->orig_asce;
 	ptr = asce.origin * 4096;
+	if (asce.r) {
+		*fake = 1;
+		asce.dt = ASCE_TYPE_REGION1;
+	}
 	switch (asce.dt) {
 	case ASCE_TYPE_REGION1:
-		if (vaddr.rfx01 > asce.tl)
+		if (vaddr.rfx01 > asce.tl && !asce.r)
 			return PGM_REGION_FIRST_TRANS;
 		break;
 	case ASCE_TYPE_REGION2:
@@ -1000,6 +1004,12 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 	case ASCE_TYPE_REGION1: {
 		union region1_table_entry rfte;
 
+		if (*fake) {
+			/* offset in 16EB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+			rfte.val = ptr;
+			goto shadow_r2t;
+		}
 		rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
 		if (rc)
 			return rc;
@@ -1011,15 +1021,22 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_REGION_SECOND_TRANS;
 		if (sg->edat_level >= 1)
 			*dat_protection |= rfte.p;
-		rc = gmap_shadow_r2t(sg, saddr, rfte.val);
+		ptr = rfte.rto << 12UL;
+shadow_r2t:
+		rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
 		if (rc)
 			return rc;
-		ptr = rfte.rto * 4096;
 		/* fallthrough */
 	}
 	case ASCE_TYPE_REGION2: {
 		union region2_table_entry rste;
 
+		if (*fake) {
+			/* offset in 8PB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+			rste.val = ptr;
+			goto shadow_r3t;
+		}
 		rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
 		if (rc)
 			return rc;
@@ -1031,16 +1048,23 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
 			return PGM_REGION_THIRD_TRANS;
 		if (sg->edat_level >= 1)
 			*dat_protection |= rste.p;
+		ptr = rste.rto << 12UL;
+shadow_r3t:
 		rste.p |= *dat_protection;
-		rc = gmap_shadow_r3t(sg, saddr, rste.val);
+		rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
 		if (rc)
 			return rc;
-		ptr = rste.rto * 4096;
 		/* fallthrough */
 	}
 	case ASCE_TYPE_REGION3: {
 		union region3_table_entry rtte;
 
+		if (*fake) {
+			/* offset in 4TB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+			rtte.val = ptr;
+			goto shadow_sgt;
+		}
 		rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
 		if (rc)
 			return rc;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index c96bf30245c0b3..c07d64f5cdb5a5 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1437,6 +1437,8 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 		return sg;
 	/* Create a new shadow gmap */
 	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+	if (asce & _ASCE_REAL_SPACE)
+		limit = -1UL;
 	new = gmap_alloc(limit);
 	if (!new)
 		return ERR_PTR(-ENOMEM);
@@ -1455,6 +1457,12 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 	}
 	atomic_set(&new->ref_count, 2);
 	list_add(&new->list, &parent->children);
+	if (asce & _ASCE_REAL_SPACE) {
+		/* nothing to protect, return right away */
+		new->initialized = true;
+		spin_unlock(&parent->shadow_lock);
+		return new;
+	}
 	spin_unlock(&parent->shadow_lock);
 	/* protect after insertion, so it will get properly invalidated */
 	down_read(&parent->mm->mmap_sem);
@@ -1479,6 +1487,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
  *
  * The r2t parameter specifies the address of the source table. The
  * four pages of the source table are made read-only in the parent gmap
@@ -1491,7 +1500,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
  *
  * Called with sg->mm->mmap_sem in read.
  */
-int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+		    int fake)
 {
 	unsigned long raddr, origin, offset, len;
 	unsigned long *s_r2t, *table;
@@ -1504,6 +1514,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
 	if (!page)
 		return -ENOMEM;
 	page->index = r2t & _REGION_ENTRY_ORIGIN;
+	if (fake)
+		page->index |= GMAP_SHADOW_FAKE_TABLE;
 	s_r2t = (unsigned long *) page_to_phys(page);
 	/* Install shadow region second table */
 	spin_lock(&sg->guest_table_lock);
@@ -1526,6 +1538,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
 	if (sg->edat_level >= 1)
 		*table |= (r2t & _REGION_ENTRY_PROTECT);
 	list_add(&page->lru, &sg->crst_list);
+	if (fake) {
+		/* nothing to protect for fake tables */
+		*table &= ~_REGION_ENTRY_INVALID;
+		spin_unlock(&sg->guest_table_lock);
+		return 0;
+	}
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r2t read-only in parent gmap page table */
 	raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
@@ -1558,6 +1576,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
  * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
  *
  * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  * shadow table structure is incomplete, -ENOMEM if out of memory and
@@ -1565,7 +1584,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
  *
  * Called with sg->mm->mmap_sem in read.
  */
-int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+		    int fake)
 {
 	unsigned long raddr, origin, offset, len;
 	unsigned long *s_r3t, *table;
@@ -1578,6 +1598,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
 	if (!page)
 		return -ENOMEM;
 	page->index = r3t & _REGION_ENTRY_ORIGIN;
+	if (fake)
+		page->index |= GMAP_SHADOW_FAKE_TABLE;
 	s_r3t = (unsigned long *) page_to_phys(page);
 	/* Install shadow region second table */
 	spin_lock(&sg->guest_table_lock);
@@ -1599,6 +1621,12 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
 	if (sg->edat_level >= 1)
 		*table |= (r3t & _REGION_ENTRY_PROTECT);
 	list_add(&page->lru, &sg->crst_list);
+	if (fake) {
+		/* nothing to protect for fake tables */
+		*table &= ~_REGION_ENTRY_INVALID;
+		spin_unlock(&sg->guest_table_lock);
+		return 0;
+	}
 	spin_unlock(&sg->guest_table_lock);
 	/* Make r3t read-only in parent gmap page table */
 	raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
@@ -1932,7 +1960,8 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
 	/* Check for top level table */
 	start = sg->orig_asce & _ASCE_ORIGIN;
 	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
-	if (gaddr >= start && gaddr < end) {
+	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+	    gaddr < end) {
 		/* The complete shadow table has to go */
 		gmap_unshadow(sg);
 		spin_unlock(&sg->guest_table_lock);

From 717c05554afa69a36398a57dac64b95972f138d5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 2 May 2016 12:10:17 +0200
Subject: [PATCH 135/302] s390/mm: limit number of real-space gmap shadows

We have no known user of real-space designation and only support it to
be architecture compliant.

Gmap shadows with real-space designation are never unshadowed
automatically, as there is nothing to protect for the top level table.

So let's simply limit the number of such shadows to one by removing
existing ones on creation of another one.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index c07d64f5cdb5a5..4a1434bc2f0e7b 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1455,6 +1455,19 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 		gmap_free(new);
 		return sg;
 	}
+	if (asce & _ASCE_REAL_SPACE) {
+		/* only allow one real-space gmap shadow */
+		list_for_each_entry(sg, &parent->children, list) {
+			if (sg->orig_asce & _ASCE_REAL_SPACE) {
+				spin_lock(&sg->guest_table_lock);
+				gmap_unshadow(sg);
+				spin_unlock(&sg->guest_table_lock);
+				list_del(&sg->list);
+				gmap_put(sg);
+				break;
+			}
+		}
+	}
 	atomic_set(&new->ref_count, 2);
 	list_add(&new->list, &parent->children);
 	if (asce & _ASCE_REAL_SPACE) {

From 4a49443924731823da2e9b3ae9311b74a34e7ed8 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 12:31:52 +0100
Subject: [PATCH 136/302] s390/mm: remember the int code for the last gmap
 fault

For nested virtualization, we want to know if we are handling a protection
exception, because these can directly be forwarded to the guest without
additional checks.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/processor.h | 1 +
 arch/s390/mm/fault.c              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 94c80b6d031d13..8c2922f540f9e1 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -110,6 +110,7 @@ struct thread_struct {
 	mm_segment_t mm_segment;
 	unsigned long gmap_addr;	/* address of last gmap fault. */
 	unsigned int gmap_write_flag;	/* gmap fault write indication */
+	unsigned int gmap_int_code;	/* int code of last gmap fault */
 	unsigned int gmap_pfault;	/* signal of a pending guest pfault */
 	struct per_regs per_user;	/* User specified PER registers */
 	struct per_event per_event;	/* Cause of the last PER trap */
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index b84416c11c434d..730e0d3aa840dd 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -419,6 +419,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	if (gmap) {
 		current->thread.gmap_addr = address;
 		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+		current->thread.gmap_int_code = regs->int_code & 0xffff;
 		address = __gmap_translate(gmap, address);
 		if (address == -EFAULT) {
 			fault = VM_FAULT_BADMAP;

From 5b6c963bcef5c3a857e3f8ba84aa9380069fc95f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 27 May 2016 18:57:33 +0200
Subject: [PATCH 137/302] s390/mm: allow to check if a gmap shadow is valid

It will be very helpful to have a mechanism to check without any locks
if a given gmap shadow is still valid and matches the given properties.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h |  1 +
 arch/s390/mm/gmap.c          | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 752cf47a81abf3..c67fb854705ee0 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -109,6 +109,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
 
 struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
 			 int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
 		    int fake);
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 4a1434bc2f0e7b..d00e4abb559e10 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1407,6 +1407,26 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
 	return NULL;
 }
 
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ *                     given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow has not been removed and its original ASCE
+ * and edat level match the given ones; the caller can continue using it.
+ * Returns 0 otherwise; the caller has to request a new shadow gmap in this
+ * case.
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+	if (sg->removed)
+		return 0;
+	return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
 /**
  * gmap_shadow - create/find a shadow guest address space
  * @parent: pointer to the parent gmap

From 01f719176f28016da1b588f6560a4eef18a98a93 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 13 Jun 2016 10:49:04 +0200
Subject: [PATCH 138/302] s390/mm: don't fault everything in read-write in
 gmap_pte_op_fixup()

Let's not fault in everything in read-write but limit it to read-only
where possible.

When restricting access rights, we already have the required protection
level in our hands. When reading from guest 2 storage (gmap_read_table),
it is obviously PROT_READ. When shadowing a pte, the required protection
level is given via the guest 2 provided pte.

Based on an initial patch by Martin Schwidefsky.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index d00e4abb559e10..738d75495e5641 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -811,19 +811,22 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
  * @gmap: pointer to guest mapping meta data structure
  * @gaddr: virtual address in the guest address space
  * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
  *
  * Returns 0 if the caller can retry __gmap_translate (might fail again),
  * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
  * up or connecting the gmap page table.
  */
 static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
-			     unsigned long vmaddr)
+			     unsigned long vmaddr, int prot)
 {
 	struct mm_struct *mm = gmap->mm;
+	unsigned int fault_flags;
 	bool unlocked = false;
 
 	BUG_ON(gmap_is_shadow(gmap));
-	if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked))
+	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+	if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
 		return -EFAULT;
 	if (unlocked)
 		/* lost mmap_sem, caller has to retry __gmap_translate */
@@ -875,7 +878,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 			vmaddr = __gmap_translate(gmap, gaddr);
 			if (IS_ERR_VALUE(vmaddr))
 				return vmaddr;
-			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
 			if (rc)
 				return rc;
 			continue;
@@ -957,7 +960,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
 			rc = vmaddr;
 			break;
 		}
-		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
+		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
 		if (rc)
 			break;
 	}
@@ -1041,7 +1044,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
 		radix_tree_preload_end();
 		if (rc) {
 			kfree(rmap);
-			rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
+			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
 			if (rc)
 				return rc;
 			continue;
@@ -1910,10 +1913,12 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
 	unsigned long vmaddr, paddr;
 	spinlock_t *ptl;
 	pte_t *sptep, *tptep;
+	int prot;
 	int rc;
 
 	BUG_ON(!gmap_is_shadow(sg));
 	parent = sg->parent;
+	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
 
 	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
 	if (!rmap)
@@ -1955,7 +1960,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
 		radix_tree_preload_end();
 		if (!rc)
 			break;
-		rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
+		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
 		if (rc)
 			break;
 	}

From 65d0b0d4bcc67b596d8e7286c3bebf24c59ade6a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2015 16:29:34 +0200
Subject: [PATCH 139/302] KVM: s390: fast path for shadow gmaps in gmap
 notifier

The default kvm gmap notifier doesn't have to handle shadow gmaps.
So let's just directly exit in case we get notified about one.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9dd52980605cb0..45a8316ba1ebe6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1986,6 +1986,8 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
 	unsigned long prefix;
 	int i;
 
+	if (gmap_is_shadow(gmap))
+		return;
 	if (start >= 1UL << 31)
 		/* We are only interested in prefix pages */
 		return;

From 37d9df98b71afdf3baf41ee5451b6206c13328c6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 11 Mar 2015 16:47:33 +0100
Subject: [PATCH 140/302] KVM: s390: backup the currently enabled gmap when
 scheduled out

Nested virtualization will have to enable its own gmaps. The current code
always re-enables vcpu->arch.gmap when a VCPU is scheduled out and back in,
which would result in the wrong gmap being enabled.

This patch reenables the last enabled gmap, therefore avoiding having to
touch vcpu->arch.gmap when enabling a different gmap.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/gmap.h     |  1 +
 arch/s390/include/asm/kvm_host.h |  2 ++
 arch/s390/kvm/kvm-s390.c         |  8 +++++---
 arch/s390/mm/gmap.c              | 11 +++++++++++
 4 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index c67fb854705ee0..741ddba0bf11b5 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -94,6 +94,7 @@ void gmap_put(struct gmap *gmap);
 
 void gmap_enable(struct gmap *gmap);
 void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
 int gmap_map_segment(struct gmap *gmap, unsigned long from,
 		     unsigned long to, unsigned long len);
 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9eed5c18a61c39..96bef30e2e33f4 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -551,6 +551,8 @@ struct kvm_vcpu_arch {
 	struct hrtimer    ckc_timer;
 	struct kvm_s390_pgm_info pgm;
 	struct gmap *gmap;
+	/* backup location for the currently enabled gmap when scheduled out */
+	struct gmap *enabled_gmap;
 	struct kvm_guestdbg_info_arch guestdbg;
 	unsigned long pfault_token;
 	unsigned long pfault_select;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 45a8316ba1ebe6..a890f7d207115a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1719,7 +1719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	save_access_regs(vcpu->arch.host_acrs);
 	restore_access_regs(vcpu->run->s.regs.acrs);
-	gmap_enable(vcpu->arch.gmap);
+	gmap_enable(vcpu->arch.enabled_gmap);
 	atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__start_cpu_timer_accounting(vcpu);
@@ -1732,7 +1732,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__stop_cpu_timer_accounting(vcpu);
 	atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-	gmap_disable(vcpu->arch.gmap);
+	vcpu->arch.enabled_gmap = gmap_get_enabled();
+	gmap_disable(vcpu->arch.enabled_gmap);
 
 	/* Save guest register state */
 	save_fpu_regs();
@@ -1781,7 +1782,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 		vcpu->arch.gmap = vcpu->kvm->arch.gmap;
 		sca_add_vcpu(vcpu);
 	}
-
+	/* make vcpu_load load the right gmap on the first trigger */
+	vcpu->arch.enabled_gmap = vcpu->arch.gmap;
 }
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 738d75495e5641..af0ae6d7ac59fd 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -270,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
 }
 EXPORT_SYMBOL_GPL(gmap_disable);
 
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns the gmap pointer stored in the lowcore, or NULL if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+	return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
 /*
  * gmap_alloc_table is assumed to be called with mmap_sem held
  */

From 3d84683bd737e397ae200e881a3230e469c59ad6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 16 Nov 2015 10:45:03 +0100
Subject: [PATCH 141/302] s390: introduce page_to_virt() and pfn_to_virt()

Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/page.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index f874e7d51c1919..b5edff32e547b6 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -147,6 +147,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn)	__va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page)	pfn_to_virt(page_to_pfn(page))
 
 #define VM_DATA_DEFAULT_FLAGS	(VM_READ | VM_WRITE | \
 				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

From df9b2b4a4aa49f874f8507680a533369e4b9c378 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 20 Jun 2016 12:09:41 +0200
Subject: [PATCH 142/302] mm/page_ref: introduce page_ref_inc_return

Let's introduce a helper that increments the page reference count and
returns the new value.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 include/linux/page_ref.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 8b5e0a9f2431a7..610e1327191849 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 	return ret;
 }
 
+static inline int page_ref_inc_return(struct page *page)
+{
+	int ret = atomic_inc_return(&page->_refcount);
+
+	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+		__page_ref_mod_and_return(page, 1, ret);
+	return ret;
+}
+
 static inline int page_ref_dec_and_test(struct page *page)
 {
 	int ret = atomic_dec_and_test(&page->_refcount);

From a3508fbe9dc6dd3bece0c7bf889cc085a011738c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 8 Jul 2015 13:19:48 +0200
Subject: [PATCH 143/302] KVM: s390: vsie: initial support for nested
 virtualization

This patch adds basic support for nested virtualization on s390x, called
VSIE (virtual SIE) and allows it to be used by the guest if the necessary
facilities are supported by the hardware and enabled for the guest.

In order to make this work, we have to shadow the sie control block
provided by guest 2. In order to gain some performance, we have to
reuse the same shadow blocks as much as possible. For now, we allow
as many shadow blocks as we have VCPUs (that way, every VCPU can run the
VSIE concurrently).

We have to watch out for the prefix getting unmapped out of our shadow
gmap and properly get the VCPU out of VSIE in that case, to fault the
prefix pages back in. We use the PROG_REQUEST bit for that purpose.

This patch is based on an initial prototype by Tobias Elpelt.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  15 +-
 arch/s390/include/uapi/asm/kvm.h |   1 +
 arch/s390/kvm/Makefile           |   2 +-
 arch/s390/kvm/kvm-s390.c         |  15 +
 arch/s390/kvm/kvm-s390.h         |   7 +
 arch/s390/kvm/priv.c             |   1 +
 arch/s390/kvm/vsie.c             | 755 +++++++++++++++++++++++++++++++
 7 files changed, 794 insertions(+), 2 deletions(-)
 create mode 100644 arch/s390/kvm/vsie.c

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 96bef30e2e33f4..255609c8690113 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -145,7 +145,7 @@ struct kvm_s390_sie_block {
 	__u64	cputm;			/* 0x0028 */
 	__u64	ckc;			/* 0x0030 */
 	__u64	epoch;			/* 0x0038 */
-	__u8	reserved40[4];		/* 0x0040 */
+	__u32	svcc;			/* 0x0040 */
 #define LCTL_CR0	0x8000
 #define LCTL_CR6	0x0200
 #define LCTL_CR9	0x0040
@@ -167,6 +167,9 @@ struct kvm_s390_sie_block {
 #define ICPT_INST	0x04
 #define ICPT_PROGI	0x08
 #define ICPT_INSTPROGI	0x0C
+#define ICPT_EXTINT	0x14
+#define ICPT_VALIDITY	0x20
+#define ICPT_STOP	0x28
 #define ICPT_OPEREXC	0x2C
 #define ICPT_PARTEXEC	0x38
 #define ICPT_IOINST	0x40
@@ -281,6 +284,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stsi;
 	u32 instruction_stfl;
 	u32 instruction_tprot;
+	u32 instruction_sie;
 	u32 instruction_essa;
 	u32 instruction_sthyi;
 	u32 instruction_sigp_sense;
@@ -637,6 +641,14 @@ struct sie_page2 {
 	u8 reserved900[0x1000 - 0x900];			/* 0x0900 */
 } __packed;
 
+struct kvm_s390_vsie {
+	struct mutex mutex;
+	struct radix_tree_root addr_to_page;
+	int page_count;
+	int next;
+	struct page *pages[KVM_MAX_VCPUS];
+};
+
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -661,6 +673,7 @@ struct kvm_arch{
 	struct sie_page2 *sie_page2;
 	struct kvm_s390_cpu_model model;
 	struct kvm_s390_crypto crypto;
+	struct kvm_s390_vsie vsie;
 	u64 epoch;
 	/* subset of available cpu features enabled by user space */
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index f0818d70d73dc8..62423b1931c002 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -98,6 +98,7 @@ struct kvm_s390_vm_cpu_machine {
 
 #define KVM_S390_VM_CPU_FEAT_NR_BITS	1024
 #define KVM_S390_VM_CPU_FEAT_ESOP	0
+#define KVM_S390_VM_CPU_FEAT_SIEF2	1
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 82e73e2b953d17..09a9e6dfc09f66 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
 
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index a890f7d207115a..3fb124226e9796 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -99,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
 	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+	{ "instruction_sie", VCPU_STAT(instruction_sie) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
 	{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
 	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -142,6 +143,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
 static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
 
 static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 
 /* Section: not file related */
@@ -187,6 +189,8 @@ int kvm_arch_hardware_setup(void)
 {
 	gmap_notifier.notifier_call = kvm_gmap_notifier;
 	gmap_register_pte_notifier(&gmap_notifier);
+	vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+	gmap_register_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_register(&s390_epoch_delta_notifier,
 				       &kvm_clock_notifier);
 	return 0;
@@ -195,6 +199,7 @@ int kvm_arch_hardware_setup(void)
 void kvm_arch_hardware_unsetup(void)
 {
 	gmap_unregister_pte_notifier(&gmap_notifier);
+	gmap_unregister_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
 					 &kvm_clock_notifier);
 }
@@ -252,6 +257,14 @@ static void kvm_s390_cpu_feat_init(void)
 
 	if (MACHINE_HAS_ESOP)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+	/*
+	 * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+	 * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+	 */
+	if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+	    !test_facility(3))
+		return;
+	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
 }
 
 int kvm_arch_init(void *opaque)
@@ -1406,6 +1419,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.epoch = 0;
 
 	spin_lock_init(&kvm->arch.start_stop_lock);
+	kvm_s390_vsie_init(kvm);
 	KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
 	return 0;
@@ -1463,6 +1477,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		gmap_remove(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
+	kvm_s390_vsie_destroy(kvm);
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 52aa47e112d801..b137fbaac91cd8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -252,6 +252,13 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+				 unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 3db3be13999299..c77ad2dc334ff7 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = {
 	[0x10] = handle_set_prefix,
 	[0x11] = handle_store_prefix,
 	[0x12] = handle_store_cpu_address,
+	[0x14] = kvm_s390_handle_vsie,
 	[0x21] = handle_ipte_interlock,
 	[0x29] = handle_iske,
 	[0x2a] = handle_rrbe,
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 00000000000000..747d4f9001555f
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,755 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+	struct kvm_s390_sie_block scb_s;	/* 0x0000 */
+	/* the pinned originial scb */
+	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
+	/* the shadow gmap in use by the vsie_page */
+	struct gmap *gmap;			/* 0x0208 */
+	__u8 reserved[0x1000 - 0x0210];		/* 0x0210 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+			     __u16 reason_code)
+{
+	scb->ipa = 0x1000;
+	scb->ipb = ((__u32) reason_code) << 16;
+	scb->icptcode = ICPT_VALIDITY;
+	return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+	atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+	prefix_unmapped(vsie_page);
+	if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+		atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+	while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+		cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+	atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+	const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+	int cpuflags;
+
+	cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+	atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+	atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags  */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+	/* we don't allow ESA/390 guests */
+	if (!(cpuflags & CPUSTAT_ZARCH))
+		return set_validity_icpt(scb_s, 0x0001U);
+
+	if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+		return set_validity_icpt(scb_s, 0x0001U);
+	else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+		return set_validity_icpt(scb_s, 0x0007U);
+
+	/* intervention requests will be set later */
+	newflags = CPUSTAT_ZARCH;
+
+	atomic_set(&scb_s->cpuflags, newflags);
+	return 0;
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+	/* interception */
+	scb_o->icptcode = scb_s->icptcode;
+	scb_o->icptstatus = scb_s->icptstatus;
+	scb_o->ipa = scb_s->ipa;
+	scb_o->ipb = scb_s->ipb;
+	scb_o->gbea = scb_s->gbea;
+
+	/* timer */
+	scb_o->cputm = scb_s->cputm;
+	scb_o->ckc = scb_s->ckc;
+	scb_o->todpr = scb_s->todpr;
+
+	/* guest state */
+	scb_o->gpsw = scb_s->gpsw;
+	scb_o->gg14 = scb_s->gg14;
+	scb_o->gg15 = scb_s->gg15;
+	memcpy(scb_o->gcr, scb_s->gcr, 128);
+	scb_o->pp = scb_s->pp;
+
+	/* interrupt intercept */
+	switch (scb_s->icptcode) {
+	case ICPT_PROGI:
+	case ICPT_INSTPROGI:
+	case ICPT_EXTINT:
+		memcpy((void *)((u64)scb_o + 0xc0),
+		       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+		break;
+	case ICPT_PARTEXEC:
+		/* MVPG only */
+		memcpy((void *)((u64)scb_o + 0xc0),
+		       (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+		break;
+	}
+
+	if (scb_s->ihcpu != 0xffffU)
+		scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	int rc;
+
+	/* make sure we don't have any leftovers when reusing the scb */
+	scb_s->icptcode = 0;
+	scb_s->eca = 0;
+	scb_s->ecb = 0;
+	scb_s->ecb2 = 0;
+	scb_s->ecb3 = 0;
+	scb_s->ecd = 0;
+
+	rc = prepare_cpuflags(vcpu, vsie_page);
+	if (rc)
+		goto out;
+
+	/* timer */
+	scb_s->cputm = scb_o->cputm;
+	scb_s->ckc = scb_o->ckc;
+	scb_s->todpr = scb_o->todpr;
+	scb_s->epoch = scb_o->epoch;
+
+	/* guest state */
+	scb_s->gpsw = scb_o->gpsw;
+	scb_s->gg14 = scb_o->gg14;
+	scb_s->gg15 = scb_o->gg15;
+	memcpy(scb_s->gcr, scb_o->gcr, 128);
+	scb_s->pp = scb_o->pp;
+
+	/* interception / execution handling */
+	scb_s->gbea = scb_o->gbea;
+	scb_s->lctl = scb_o->lctl;
+	scb_s->svcc = scb_o->svcc;
+	scb_s->ictl = scb_o->ictl;
+	/*
+	 * SKEY handling functions can't deal with false setting of PTE invalid
+	 * bits. Therefore we cannot provide interpretation and would later
+	 * have to provide own emulation handlers.
+	 */
+	scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+	scb_s->icpua = scb_o->icpua;
+
+	 /* SIE will do mso/msl validity and exception checks for us */
+	scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+	scb_s->mso = scb_o->mso & 0xfffffffffff00000UL;
+	scb_s->prefix = scb_o->prefix;
+
+	/* We have to definetly flush the tlb if this scb never ran */
+	if (scb_s->ihcpu != 0xffffU)
+		scb_s->ihcpu = scb_o->ihcpu;
+
+	/* MVPG and Protection Exception Interpretation are always available */
+	scb_s->eca |= scb_o->eca & 0x01002000U;
+
+out:
+	if (rc)
+		unshadow_scb(vcpu, vsie_page);
+	return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+				 unsigned long end)
+{
+	struct kvm *kvm = gmap->private;
+	struct vsie_page *cur;
+	unsigned long prefix;
+	struct page *page;
+	int i;
+
+	if (!gmap_is_shadow(gmap))
+		return;
+	if (start >= 1UL << 31)
+		/* We are only interested in prefix pages */
+		return;
+
+	/*
+	 * Only new shadow blocks are added to the list during runtime,
+	 * therefore we can safely reference them all the time.
+	 */
+	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+		page = READ_ONCE(kvm->arch.vsie.pages[i]);
+		if (!page)
+			continue;
+		cur = page_to_virt(page);
+		if (READ_ONCE(cur->gmap) != gmap)
+			continue;
+		prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+		/* with mso/msl, the prefix lies at an offset */
+		prefix += cur->scb_s.mso;
+		if (prefix <= end && start <= prefix + PAGE_SIZE - 1)
+			prefix_unmapped_sync(cur);
+	}
+}
+
+/*
+ * Map the first prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 on if successfully mapped or already mapped
+ *          - > 0 if control has to be given to guest 2
+ *          - -EAGAIN if the caller can retry immediately
+ *          - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+	int rc;
+
+	/* mark it as mapped so we can catch any concurrent unmappers */
+	prefix_mapped(vsie_page);
+
+	/* with mso/msl, the prefix lies at offset *mso* */
+	prefix += scb_s->mso;
+
+	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+	/*
+	 * We don't have to mprotect, we will be called for all unshadows.
+	 * SIE will detect if protection applies and trigger a validity.
+	 */
+	if (rc)
+		prefix_unmapped(vsie_page);
+	if (rc > 0 || rc == -EFAULT)
+		rc = set_validity_icpt(scb_s, 0x0037U);
+	return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ *          - -EINVAL if the gpa is not valid guest storage
+ *          - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+	struct page *page;
+	hva_t hva;
+	int rc;
+
+	hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+	if (kvm_is_error_hva(hva))
+		return -EINVAL;
+	rc = get_user_pages_fast(hva, 1, 1, &page);
+	if (rc < 0)
+		return rc;
+	else if (rc != 1)
+		return -ENOMEM;
+	*hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+	return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+	struct page *page;
+
+	page = virt_to_page(hpa);
+	set_page_dirty_lock(page);
+	put_page(page);
+	/* mark the page always as dirty for migration */
+	mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	hpa_t hpa;
+	gpa_t gpa;
+
+	hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+	if (hpa) {
+		gpa = scb_o->scaol & ~0xfUL;
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		scb_s->scaol = 0;
+		scb_s->scaoh = 0;
+	}
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	hpa_t hpa;
+	gpa_t gpa;
+	int rc = 0;
+
+	gpa = scb_o->scaol & ~0xfUL;
+	if (gpa) {
+		if (!(gpa & ~0x1fffUL))
+			rc = set_validity_icpt(scb_s, 0x0038U);
+		else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+			rc = set_validity_icpt(scb_s, 0x0011U);
+		else if ((gpa & PAGE_MASK) !=
+			 ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+			rc = set_validity_icpt(scb_s, 0x003bU);
+		if (!rc) {
+			rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+			if (rc == -EINVAL)
+				rc = set_validity_icpt(scb_s, 0x0034U);
+		}
+		if (rc)
+			goto unpin;
+		scb_s->scaoh = (u32)((u64)hpa >> 32);
+		scb_s->scaol = (u32)(u64)hpa;
+	}
+	return 0;
+unpin:
+	unpin_blocks(vcpu, vsie_page);
+	return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+		      gpa_t gpa)
+{
+	hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+	if (hpa)
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+	vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+		   gpa_t gpa)
+{
+	hpa_t hpa;
+	int rc;
+
+	rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+	if (rc == -EINVAL) {
+		rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		if (!rc)
+			rc = 1;
+	}
+	if (!rc)
+		vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+	return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ *            < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+			bool write_flag)
+{
+	struct kvm_s390_pgm_info pgm = {
+		.code = code,
+		.trans_exc_code =
+			/* 0-51: virtual address */
+			(vaddr & 0xfffffffffffff000UL) |
+			/* 52-53: store / fetch */
+			(((unsigned int) !write_flag) + 1) << 10,
+			/* 62-63: asce id (alway primary == 0) */
+		.exc_access_id = 0, /* always primary */
+		.op_access_id = 0, /* not MVPG */
+	};
+	int rc;
+
+	if (code == PGM_PROTECTION)
+		pgm.trans_exc_code |= 0x4UL;
+
+	rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+	return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	int rc;
+
+	if (current->thread.gmap_int_code == PGM_PROTECTION)
+		/* we can directly forward all protection exceptions */
+		return inject_fault(vcpu, PGM_PROTECTION,
+				    current->thread.gmap_addr, 1);
+
+	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+				   current->thread.gmap_addr);
+	if (rc > 0) {
+		rc = inject_fault(vcpu, rc,
+				  current->thread.gmap_addr,
+				  current->thread.gmap_write_flag);
+	}
+	return rc;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+	vsie_page->scb_s.icptcode = 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	int rc;
+
+	if (need_resched())
+		schedule();
+	if (test_cpu_flag(CIF_MCCK_PENDING))
+		s390_handle_mcck();
+
+	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+	local_irq_disable();
+	kvm_guest_enter();
+	local_irq_enable();
+
+	rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+	local_irq_disable();
+	kvm_guest_exit();
+	local_irq_enable();
+	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	if (rc > 0)
+		rc = 0; /* we could still have an icpt */
+	else if (rc == -EFAULT)
+		return handle_fault(vcpu, vsie_page);
+
+	switch (scb_s->icptcode) {
+	case ICPT_STOP:
+		/* stop not requested by g2 - must have been a kick */
+		if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+			clear_vsie_icpt(vsie_page);
+		break;
+	case ICPT_VALIDITY:
+		if ((scb_s->ipa & 0xf000) != 0xf000)
+			scb_s->ipa += 0x1000;
+		break;
+	}
+	return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+	if (vsie_page->gmap)
+		gmap_put(vsie_page->gmap);
+	WRITE_ONCE(vsie_page->gmap, NULL);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+			       struct vsie_page *vsie_page)
+{
+	unsigned long asce;
+	union ctlreg0 cr0;
+	struct gmap *gmap;
+	int edat;
+
+	asce = vcpu->arch.sie_block->gcr[1];
+	cr0.val = vcpu->arch.sie_block->gcr[0];
+	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+	edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+	gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+	if (IS_ERR(gmap))
+		return PTR_ERR(gmap);
+	gmap->private = vcpu->kvm;
+	WRITE_ONCE(vsie_page->gmap, gmap);
+	return 0;
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	int rc = 0;
+
+	while (1) {
+		rc = acquire_gmap_shadow(vcpu, vsie_page);
+		if (!rc)
+			rc = map_prefix(vcpu, vsie_page);
+		if (!rc) {
+			gmap_enable(vsie_page->gmap);
+			update_intervention_requests(vsie_page);
+			rc = do_vsie_run(vcpu, vsie_page);
+			gmap_enable(vcpu->arch.gmap);
+		}
+		release_gmap_shadow(vsie_page);
+
+		if (rc == -EAGAIN)
+			rc = 0;
+		if (rc || scb_s->icptcode || signal_pending(current) ||
+		    kvm_s390_vcpu_has_irq(vcpu, 0))
+			break;
+	};
+
+	if (rc == -EFAULT) {
+		/*
+		 * Addressing exceptions are always presentes as intercepts.
+		 * As addressing exceptions are suppressing and our guest 3 PSW
+		 * points at the responsible instruction, we have to
+		 * forward the PSW and set the ilc. If we can't read guest 3
+		 * instruction, we can use an arbitrary ilc. Let's always use
+		 * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+		 * memory. (we could also fake the shadow so the hardware
+		 * handles it).
+		 */
+		scb_s->icptcode = ICPT_PROGI;
+		scb_s->iprcc = PGM_ADDRESSING;
+		scb_s->pgmilc = 4;
+		scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+	}
+	return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ *          - NULL if the same scb address is already used by another VCPU
+ *          - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+	struct vsie_page *vsie_page;
+	struct page *page;
+	int nr_vcpus;
+
+	rcu_read_lock();
+	page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+	rcu_read_unlock();
+	if (page) {
+		if (page_ref_inc_return(page) == 2)
+			return page_to_virt(page);
+		page_ref_dec(page);
+	}
+
+	/*
+	 * We want at least #online_vcpus shadows, so every VCPU can execute
+	 * the VSIE in parallel.
+	 */
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+	mutex_lock(&kvm->arch.vsie.mutex);
+	if (kvm->arch.vsie.page_count < nr_vcpus) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page) {
+			mutex_unlock(&kvm->arch.vsie.mutex);
+			return ERR_PTR(-ENOMEM);
+		}
+		page_ref_inc(page);
+		kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+		kvm->arch.vsie.page_count++;
+	} else {
+		/* reuse an existing entry that belongs to nobody */
+		while (true) {
+			page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+			if (page_ref_inc_return(page) == 2)
+				break;
+			page_ref_dec(page);
+			kvm->arch.vsie.next++;
+			kvm->arch.vsie.next %= nr_vcpus;
+		}
+		radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+	}
+	page->index = addr;
+	/* double use of the same address */
+	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+		page_ref_dec(page);
+		mutex_unlock(&kvm->arch.vsie.mutex);
+		return NULL;
+	}
+	mutex_unlock(&kvm->arch.vsie.mutex);
+
+	vsie_page = page_to_virt(page);
+	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+	vsie_page->scb_s.ihcpu = 0xffffU;
+	return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+	struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+	page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+	struct vsie_page *vsie_page;
+	unsigned long scb_addr;
+	int rc;
+
+	vcpu->stat.instruction_sie++;
+	if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+		return -EOPNOTSUPP;
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+	scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+	/* 512 byte alignment */
+	if (unlikely(scb_addr & 0x1ffUL))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+		return 0;
+
+	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+	if (IS_ERR(vsie_page))
+		return PTR_ERR(vsie_page);
+	else if (!vsie_page)
+		/* double use of sie control block - simply do nothing */
+		return 0;
+
+	rc = pin_scb(vcpu, vsie_page, scb_addr);
+	if (rc)
+		goto out_put;
+	rc = shadow_scb(vcpu, vsie_page);
+	if (rc)
+		goto out_unpin_scb;
+	rc = pin_blocks(vcpu, vsie_page);
+	if (rc)
+		goto out_unshadow;
+	rc = vsie_run(vcpu, vsie_page);
+	unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+	unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+	unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+	put_vsie_page(vcpu->kvm, vsie_page);
+
+	return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+	mutex_init(&kvm->arch.vsie.mutex);
+	INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+	struct page *page;
+	int i;
+
+	mutex_lock(&kvm->arch.vsie.mutex);
+	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+		page = kvm->arch.vsie.pages[i];
+		kvm->arch.vsie.pages[i] = NULL;
+		/* free the radix tree entry */
+		radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+		__free_page(page);
+	}
+	kvm->arch.vsie.page_count = 0;
+	mutex_unlock(&kvm->arch.vsie.mutex);
+}

From 06d68a6c85d95515533663ff002d06753fd772aa Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 22 Apr 2016 13:50:09 +0200
Subject: [PATCH 144/302] KVM: s390: vsie: optimize gmap prefix mapping

In order to not always map the prefix, we have to take care of certain
aspects that implicitly unmap the prefix:
- Changes to the prefix address
- Changes to MSO, because the HVA of the prefix is changed
- Changes of the gmap shadow (e.g. unshadowed, asce or edat changes)

By properly handling these cases, we can stop remapping the prefix when
there is no reason to do so.

This also allows us now to not acquire any gmap shadow locks when
rerunning the vsie and still having a valid gmap shadow.

Please note, to detect changing gmap shadows, we have to keep the reference
of the gmap shadow. The address of a gmap shadow does otherwise not
reliably indicate if the gmap shadow has changed (the memory chunk
could get reused).

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 747d4f9001555f..2839efcfc5ffec 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -62,6 +62,11 @@ static void prefix_mapped(struct vsie_page *vsie_page)
 	atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
 }
 
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+	return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
 
 /* copy the updated intervention request bits into the shadow scb */
 static void update_intervention_requests(struct vsie_page *vsie_page)
@@ -152,6 +157,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	unsigned long new_mso;
 	int rc;
 
 	/* make sure we don't have any leftovers when reusing the scb */
@@ -192,9 +198,13 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 	scb_s->icpua = scb_o->icpua;
 
+	new_mso = scb_o->mso & 0xfffffffffff00000UL;
+	/* if the hva of the prefix changes, we have to remap the prefix */
+	if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+		prefix_unmapped(vsie_page);
 	 /* SIE will do mso/msl validity and exception checks for us */
 	scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
-	scb_s->mso = scb_o->mso & 0xfffffffffff00000UL;
+	scb_s->mso = new_mso;
 	scb_s->prefix = scb_o->prefix;
 
 	/* We have to definetly flush the tlb if this scb never ran */
@@ -262,6 +272,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
 	int rc;
 
+	if (prefix_is_mapped(vsie_page))
+		return 0;
+
 	/* mark it as mapped so we can catch any concurrent unmappers */
 	prefix_mapped(vsie_page);
 
@@ -532,6 +545,7 @@ static void release_gmap_shadow(struct vsie_page *vsie_page)
 	if (vsie_page->gmap)
 		gmap_put(vsie_page->gmap);
 	WRITE_ONCE(vsie_page->gmap, NULL);
+	prefix_unmapped(vsie_page);
 }
 
 static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
@@ -547,6 +561,16 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
 	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
 	edat += edat && test_kvm_facility(vcpu->kvm, 78);
 
+	/*
+	 * ASCE or EDAT could have changed since last icpt, or the gmap
+	 * we're holding has been unshadowed. If the gmap is still valid,
+	 * we can safely reuse it.
+	 */
+	if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+		return 0;
+
+	/* release the old shadow - if any, and mark the prefix as unmapped */
+	release_gmap_shadow(vsie_page);
 	gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
 	if (IS_ERR(gmap))
 		return PTR_ERR(gmap);
@@ -578,7 +602,6 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			rc = do_vsie_run(vcpu, vsie_page);
 			gmap_enable(vcpu->arch.gmap);
 		}
-		release_gmap_shadow(vsie_page);
 
 		if (rc == -EAGAIN)
 			rc = 0;
@@ -667,6 +690,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 
 	vsie_page = page_to_virt(page);
 	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+	release_gmap_shadow(vsie_page);
 	vsie_page->scb_s.ihcpu = 0xffffU;
 	return vsie_page;
 }
@@ -739,6 +763,7 @@ void kvm_s390_vsie_init(struct kvm *kvm)
 /* Destroy the vsie data structures. To be called when a vm is destroyed. */
 void kvm_s390_vsie_destroy(struct kvm *kvm)
 {
+	struct vsie_page *vsie_page;
 	struct page *page;
 	int i;
 
@@ -746,6 +771,8 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
 	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
 		page = kvm->arch.vsie.pages[i];
 		kvm->arch.vsie.pages[i] = NULL;
+		vsie_page = page_to_virt(page);
+		release_gmap_shadow(vsie_page);
 		/* free the radix tree entry */
 		radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
 		__free_page(page);

From 3573602b20b061030c34b04f206b781857f155df Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 19 Feb 2016 10:11:24 +0100
Subject: [PATCH 145/302] KVM: s390: vsie: support setting the ibc

As soon as we forward an ibc to guest 2 (indicated via
kvm->arch.model.ibc), he can also use it for guest 3. Let's properly round
the ibc up/down, so we avoid any potential validity icpts from the
underlying SIE, if it doesn't simply round the values.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 2839efcfc5ffec..1165baf7853532 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -102,6 +102,26 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	return 0;
 }
 
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	__u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+	scb_s->ibc = 0;
+	/* ibc installed in g2 and requested for g3 */
+	if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+		scb_s->ibc = scb_o->ibc & 0x0fffU;
+		/* take care of the minimum ibc level of the machine */
+		if (scb_s->ibc < min_ibc)
+			scb_s->ibc = min_ibc;
+		/* take care of the maximum ibc level set for the guest */
+		if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+			scb_s->ibc = vcpu->kvm->arch.model.ibc;
+	}
+}
+
 /* unshadow the scb, copying parameters back to the real scb */
 static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
@@ -214,6 +234,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	/* MVPG and Protection Exception Interpretation are always available */
 	scb_s->eca |= scb_o->eca & 0x01002000U;
 
+	prepare_ibc(vcpu, vsie_page);
 out:
 	if (rc)
 		unshadow_scb(vcpu, vsie_page);

From 535ef81c6e7910c0205f58a69ed6c765f8ba7f18 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 12 Feb 2016 12:24:20 +0100
Subject: [PATCH 146/302] KVM: s390: vsie: support edat1 / edat2

If guest 2 is allowed to use edat 1 / edat 2, it can also set it up for
guest 3, so let's properly check and forward the edat cpuflags.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 1165baf7853532..7c9835b0a33f7b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -97,6 +97,13 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 
 	/* intervention requests will be set later */
 	newflags = CPUSTAT_ZARCH;
+	if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+		newflags |= CPUSTAT_GED;
+	if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+		if (cpuflags & CPUSTAT_GED)
+			return set_validity_icpt(scb_s, 0x0001U);
+		newflags |= CPUSTAT_GED2;
+	}
 
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;

From 4ceafa9027b0c2671ab731c7d95896a5b3c2dc0b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 27 Nov 2015 12:34:28 +0100
Subject: [PATCH 147/302] KVM: s390: vsie: support host-protection-interruption

Introduced with ESOP, therefore available for the guest if it
is allowed to use ESOP.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 7c9835b0a33f7b..aaed63ce29b23d 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -240,6 +240,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 
 	/* MVPG and Protection Exception Interpretation are always available */
 	scb_s->eca |= scb_o->eca & 0x01002000U;
+	/* Host-protection-interruption introduced with ESOP */
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+		scb_s->ecb |= scb_o->ecb & 0x02U;
 
 	prepare_ibc(vcpu, vsie_page);
 out:

From 66b630d5b7f2d3afb5e8eddad3e8326091375f1a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 26 Nov 2015 14:11:19 +0100
Subject: [PATCH 148/302] KVM: s390: vsie: support STFLE interpretation

Issuing STFLE is extremely rare. Instead of copying 2k on every
VSIE call, let's do this lazily, when a guest 3 tries to execute
STFLE. We can setup the block and retry.

Unfortunately, we can't directly forward that facility list, as
we only have a 31 bit address for the facility list designation.
So let's use a DMA allocation for our vsie_page instead for now.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 49 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index aaed63ce29b23d..cd4bbfa72881dd 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -18,6 +18,7 @@
 #include <asm/mmu_context.h>
 #include <asm/sclp.h>
 #include <asm/nmi.h>
+#include <asm/dis.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -27,7 +28,8 @@ struct vsie_page {
 	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
 	/* the shadow gmap in use by the vsie_page */
 	struct gmap *gmap;			/* 0x0208 */
-	__u8 reserved[0x1000 - 0x0210];		/* 0x0210 */
+	__u8 reserved[0x0800 - 0x0210];		/* 0x0210 */
+	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
 } __packed;
 
 /* trigger a validity icpt for the given scb */
@@ -194,6 +196,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	scb_s->ecb2 = 0;
 	scb_s->ecb3 = 0;
 	scb_s->ecd = 0;
+	scb_s->fac = 0;
 
 	rc = prepare_cpuflags(vcpu, vsie_page);
 	if (rc)
@@ -521,6 +524,44 @@ static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
 	vsie_page->scb_s.icptcode = 0;
 }
 
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	int ilen = insn_length(scb_s->ipa >> 8);
+
+	/* EXECUTE: the length comes from icptstatus instead, 0 means 4 */
+	if (scb_s->icptstatus & 1) {
+		ilen = (scb_s->icptstatus >> 4) & 0x6;
+		if (!ilen)
+			ilen = 4;
+	}
+	scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+	clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ * The list is copied into vsie_page->fac, which the shadow scb then uses.
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ *          - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	__u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+	if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+		retry_vsie_icpt(vsie_page);
+		if (read_guest_real(vcpu, fac, &vsie_page->fac,
+				    sizeof(vsie_page->fac)))
+			return set_validity_icpt(scb_s, 0x1090U);
+		scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+	}
+	return 0;
+}
+
 /*
  * Run the vsie on a shadow scb and a shadow gmap, without any further
  * sanity checks, handling SIE faults.
@@ -558,6 +599,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		return handle_fault(vcpu, vsie_page);
 
 	switch (scb_s->icptcode) {
+	case ICPT_INST:
+		if (scb_s->ipa == 0xb2b0)
+			rc = handle_stfle(vcpu, vsie_page);
+		break;
 	case ICPT_STOP:
 		/* stop not requested by g2 - must have been a kick */
 		if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
@@ -690,7 +735,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 
 	mutex_lock(&kvm->arch.vsie.mutex);
 	if (kvm->arch.vsie.page_count < nr_vcpus) {
-		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
 		if (!page) {
 			mutex_unlock(&kvm->arch.vsie.mutex);
 			return ERR_PTR(-ENOMEM);

From bbeaa58b32ab627b68748543b3dcb98b9a28d570 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 26 Nov 2015 13:11:42 +0100
Subject: [PATCH 149/302] KVM: s390: vsie: support aes dea wrapping keys

As soon as message-security-assist extension 3 is enabled for guest 2,
we have to allow key wrapping for guest 3.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 56 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index cd4bbfa72881dd..6b26b0be63c14f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -28,7 +28,8 @@ struct vsie_page {
 	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
 	/* the shadow gmap in use by the vsie_page */
 	struct gmap *gmap;			/* 0x0208 */
-	__u8 reserved[0x0800 - 0x0210];		/* 0x0210 */
+	__u8 reserved[0x0700 - 0x0210];		/* 0x0210 */
+	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
 	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
 } __packed;
 
@@ -111,6 +112,58 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	return 0;
 }
 
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+	unsigned long *b1, *b2;
+	u8 ecb3_flags;
+
+	scb_s->crycbd = 0;
+	if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+		return 0;
+	/* format-1 is supported with message-security-assist extension 3 */
+	if (!test_kvm_facility(vcpu->kvm, 76))
+		return 0;
+	/* we may only allow it if enabled for guest 2 */
+	ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+		     (ECB3_AES | ECB3_DEA);
+	if (!ecb3_flags)
+		return 0;
+
+	if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+		return set_validity_icpt(scb_s, 0x003CU);
+	else if (!crycb_addr)
+		return set_validity_icpt(scb_s, 0x0039U);
+
+	/* copy only the wrapping keys, into the buffer that is xor'ed below */
+	b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+	if (read_guest_real(vcpu, crycb_addr + 72, b1, 56))
+		return set_validity_icpt(scb_s, 0x0035U);
+
+	scb_s->ecb3 |= ecb3_flags;
+	scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+			CRYCB_FORMAT2;
+
+	/* xor both blocks in one run */
+	b2 = (unsigned long *)
+			    vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+	/* as 56%8 == 0, bitmap_xor won't overwrite any data */
+	bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+	return 0;
+}
+
 /* shadow (round up/down) the ibc to avoid validity icpt */
 static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
@@ -248,6 +301,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->ecb |= scb_o->ecb & 0x02U;
 
 	prepare_ibc(vcpu, vsie_page);
+	rc = shadow_crycb(vcpu, vsie_page);
 out:
 	if (rc)
 		unshadow_scb(vcpu, vsie_page);

From 166ecb3d3cfecb62c31fdeab9949d70e84cd75cd Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 25 Nov 2015 11:13:32 +0100
Subject: [PATCH 150/302] KVM: s390: vsie: support transactional execution

As soon as guest 2 is allowed to use transactional execution (indicated via
STFLE), he can also enable it for guest 3.

Active transactional execution requires also the second prefix page to be
mapped. If that page cannot be mapped, a validity icpt has to be presented
to the guest.

We have to take care of tx being toggled on/off, otherwise we might get
wrong prefix validity icpt.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 6b26b0be63c14f..4e2c71cced4830 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -239,6 +239,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	bool had_tx = scb_s->ecb & 0x10U;
 	unsigned long new_mso;
 	int rc;
 
@@ -299,6 +300,13 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	/* Host-protection-interruption introduced with ESOP */
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
 		scb_s->ecb |= scb_o->ecb & 0x02U;
+	/* transactional execution */
+	if (test_kvm_facility(vcpu->kvm, 73)) {
+		/* remap the prefix if tx is toggled on */
+		if ((scb_o->ecb & 0x10U) && !had_tx)
+			prefix_unmapped(vsie_page);
+		scb_s->ecb |= scb_o->ecb & 0x10U;
+	}
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
@@ -337,13 +345,13 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
 		prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
 		/* with mso/msl, the prefix lies at an offset */
 		prefix += cur->scb_s.mso;
-		if (prefix <= end && start <= prefix + PAGE_SIZE - 1)
+		if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
 			prefix_unmapped_sync(cur);
 	}
 }
 
 /*
- * Map the first prefix page.
+ * Map the first prefix page and if tx is enabled also the second prefix page.
  *
  * The prefix will be protected, a gmap notifier will inform about unmaps.
  * The shadow scb must not be executed until the prefix is remapped, this is
@@ -370,6 +378,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	prefix += scb_s->mso;
 
 	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+	if (!rc && (scb_s->ecb & 0x10U))
+		rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+					   prefix + PAGE_SIZE);
 	/*
 	 * We don't have to mprotect, we will be called for all unshadows.
 	 * SIE will detect if protection applies and trigger a validity.
@@ -434,6 +445,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->scaol = 0;
 		scb_s->scaoh = 0;
 	}
+
+	hpa = scb_s->itdba;
+	if (hpa) {
+		gpa = scb_o->itdba & ~0xffUL;
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		scb_s->itdba = 0;
+	}
 }
 
 /*
@@ -477,6 +495,21 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->scaoh = (u32)((u64)hpa >> 32);
 		scb_s->scaol = (u32)(u64)hpa;
 	}
+
+	gpa = scb_o->itdba & ~0xffUL;
+	if (gpa && (scb_s->ecb & 0x10U)) {
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x0080U);
+			goto unpin;
+		}
+		/* 256 bytes cannot cross page boundaries */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x0080U);
+		if (rc)
+			goto unpin;
+		scb_s->itdba = hpa;
+	}
 	return 0;
 unpin:
 	unpin_blocks(vcpu, vsie_page);

From c9bc1eabe5ee49f1be68550cc0bd907b55d9da8d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 25 Nov 2015 11:08:32 +0100
Subject: [PATCH 151/302] KVM: s390: vsie: support vector facility (SIMD)

As soon as guest 2 is allowed to use the vector facility (indicated via
STFLE), it can also enable it for guest 3. We have to take care of the
satellite block that might be used when not relying on lazy vector
copying (not the case for KVM).

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  2 +-
 arch/s390/kvm/vsie.c             | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 255609c8690113..190ad63291fb22 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -229,7 +229,7 @@ struct kvm_s390_sie_block {
 	__u8	reserved1e6[2];		/* 0x01e6 */
 	__u64	itdba;			/* 0x01e8 */
 	__u64   riccbd;			/* 0x01f0 */
-	__u8    reserved1f8[8];		/* 0x01f8 */
+	__u64	gvrd;			/* 0x01f8 */
 } __attribute__((packed));
 
 struct kvm_s390_itdb {
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4e2c71cced4830..6d9f4058ce1572 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -307,6 +307,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			prefix_unmapped(vsie_page);
 		scb_s->ecb |= scb_o->ecb & 0x10U;
 	}
+	/* SIMD */
+	if (test_kvm_facility(vcpu->kvm, 129)) {
+		scb_s->eca |= scb_o->eca & 0x00020000U;
+		scb_s->ecd |= scb_o->ecd & 0x20000000U;
+	}
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
@@ -452,6 +457,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		unpin_guest_page(vcpu->kvm, gpa, hpa);
 		scb_s->itdba = 0;
 	}
+
+	hpa = scb_s->gvrd;
+	if (hpa) {
+		gpa = scb_o->gvrd & ~0x1ffUL;
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		scb_s->gvrd = 0;
+	}
 }
 
 /*
@@ -510,6 +522,25 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			goto unpin;
 		scb_s->itdba = hpa;
 	}
+
+	gpa = scb_o->gvrd & ~0x1ffUL;
+	if (gpa && (scb_s->eca & 0x00020000U) &&
+	    !(scb_s->ecd & 0x20000000U)) {
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x1310U);
+			goto unpin;
+		}
+		/*
+		 * 512 bytes vector registers cannot cross page boundaries
+		 * if this block gets bigger, we have to shadow it.
+		 */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x1310U);
+		if (rc)
+			goto unpin;
+		scb_s->gvrd = hpa;
+	}
 	return 0;
 unpin:
 	unpin_blocks(vcpu, vsie_page);

From 588438cba015ff3d14504b7598308dd3ebe06a99 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 26 Jan 2016 12:51:06 +0100
Subject: [PATCH 152/302] KVM: s390: vsie: support run-time-instrumentation

As soon as guest 2 is allowed to use run-time-instrumentation (indicated
via STFLE), it can also enable it for guest 3.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 6d9f4058ce1572..ebc988ffd3e50c 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -312,6 +312,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->eca |= scb_o->eca & 0x00020000U;
 		scb_s->ecd |= scb_o->ecd & 0x20000000U;
 	}
+	/* Run-time-Instrumentation */
+	if (test_kvm_facility(vcpu->kvm, 64))
+		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
@@ -464,6 +467,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		unpin_guest_page(vcpu->kvm, gpa, hpa);
 		scb_s->gvrd = 0;
 	}
+
+	hpa = scb_s->riccbd;
+	if (hpa) {
+		gpa = scb_o->riccbd & ~0x3fUL;
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		scb_s->riccbd = 0;
+	}
 }
 
 /*
@@ -541,6 +551,22 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			goto unpin;
 		scb_s->gvrd = hpa;
 	}
+
+	gpa = scb_o->riccbd & ~0x3fUL;
+	if (gpa && (scb_s->ecb3 & 0x01U)) {
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x0043U);
+			goto unpin;
+		}
+		/* 64 bytes cannot cross page boundaries */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x0043U);
+		/* Validity 0x0044 will be checked by SIE */
+		if (rc)
+			goto unpin;
+		scb_s->riccbd = hpa;
+	}
 	return 0;
 unpin:
 	unpin_blocks(vcpu, vsie_page);

From 19c439b564b05939b83876a687bd48389d0aebb5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 25 Nov 2015 11:02:26 +0100
Subject: [PATCH 153/302] KVM: s390: vsie: support 64-bit-SCAO

Let's provide the 64-bit-SCAO facility to guest 2, so he can set up a SCA
for guest 3 that has a 64 bit address. Please note that we already require
the 64 bit SCAO for our vsie implementation, in order to forward the SCA
directly (by pinning the page).

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 4 ++++
 3 files changed, 7 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 62423b1931c002..c5e4537e96d92e 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -99,6 +99,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_NR_BITS	1024
 #define KVM_S390_VM_CPU_FEAT_ESOP	0
 #define KVM_S390_VM_CPU_FEAT_SIEF2	1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO	2
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3fb124226e9796..e0c5a57bf58b93 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -265,6 +265,8 @@ static void kvm_s390_cpu_feat_init(void)
 	    !test_facility(3))
 		return;
 	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+	if (sclp.has_64bscao)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index ebc988ffd3e50c..44e66c32902619 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -449,6 +449,8 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
 	if (hpa) {
 		gpa = scb_o->scaol & ~0xfUL;
+		if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+			gpa |= (u64) scb_o->scaoh << 32;
 		unpin_guest_page(vcpu->kvm, gpa, hpa);
 		scb_s->scaol = 0;
 		scb_s->scaoh = 0;
@@ -499,6 +501,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	int rc = 0;
 
 	gpa = scb_o->scaol & ~0xfUL;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+		gpa |= (u64) scb_o->scaoh << 32;
 	if (gpa) {
 		if (!(gpa & ~0x1fffUL))
 			rc = set_validity_icpt(scb_s, 0x0038U);

From 0615a326e066b580cf26d16a092ea54997dd6cbb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 25 Nov 2015 09:59:49 +0100
Subject: [PATCH 154/302] KVM: s390: vsie: support shared IPTE-interlock
 facility

As we forward the whole SCA provided by guest 2, we can directly forward
SIIF if available.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index c5e4537e96d92e..1d2e820f763d40 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -100,6 +100,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_ESOP	0
 #define KVM_S390_VM_CPU_FEAT_SIEF2	1
 #define KVM_S390_VM_CPU_FEAT_64BSCAO	2
+#define KVM_S390_VM_CPU_FEAT_SIIF	3
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e0c5a57bf58b93..d735612f908101 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -267,6 +267,8 @@ static void kvm_s390_cpu_feat_init(void)
 	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
 	if (sclp.has_64bscao)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+	if (sclp.has_siif)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 44e66c32902619..1615ed37f7dad0 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -315,6 +315,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	/* Run-time-Instrumentation */
 	if (test_kvm_facility(vcpu->kvm, 64))
 		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+		scb_s->eca |= scb_o->eca & 0x00000001U;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);

From 77d18f6d47fbeaaceb15df9ab928757d5bb96ec6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:32:35 +0100
Subject: [PATCH 155/302] KVM: s390: vsie: support guest-PER-enhancement

We can easily forward the guest-PER-enhancement facility to guest 2 if
available.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 1d2e820f763d40..98526ac114bfd7 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -101,6 +101,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_SIEF2	1
 #define KVM_S390_VM_CPU_FEAT_64BSCAO	2
 #define KVM_S390_VM_CPU_FEAT_SIIF	3
+#define KVM_S390_VM_CPU_FEAT_GPERE	4
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d735612f908101..175752877c0d73 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -269,6 +269,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
 	if (sclp.has_siif)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+	if (sclp.has_gpere)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 1615ed37f7dad0..b8792ef0103077 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -107,6 +107,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			return set_validity_icpt(scb_s, 0x0001U);
 		newflags |= CPUSTAT_GED2;
 	}
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+		newflags |= cpuflags & CPUSTAT_P;
 
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;

From a1b7b9b286c0157748922526ecb353e550209833 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:41:33 +0100
Subject: [PATCH 156/302] KVM: s390: vsie: support
 guest-storage-limit-suppression

We can easily forward guest-storage-limit-suppression if available.

One thing to care about is keeping the prefix properly mapped when
gsls is toggled on/off or the mso changes in between. Therefore we better
remap the prefix on any mso changes just like we already do with the
prefix.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 7 +++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 98526ac114bfd7..9ed07479714f01 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -102,6 +102,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_64BSCAO	2
 #define KVM_S390_VM_CPU_FEAT_SIIF	3
 #define KVM_S390_VM_CPU_FEAT_GPERE	4
+#define KVM_S390_VM_CPU_FEAT_GSLS	5
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 175752877c0d73..ce9813afd50280 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -271,6 +271,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
 	if (sclp.has_gpere)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+	if (sclp.has_gsls)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b8792ef0103077..ea65bf2f02011c 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -109,6 +109,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	}
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
 		newflags |= cpuflags & CPUSTAT_P;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+		newflags |= cpuflags & CPUSTAT_SM;
 
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;
@@ -242,7 +244,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	bool had_tx = scb_s->ecb & 0x10U;
-	unsigned long new_mso;
+	unsigned long new_mso = 0;
 	int rc;
 
 	/* make sure we don't have any leftovers when reusing the scb */
@@ -284,7 +286,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 	scb_s->icpua = scb_o->icpua;
 
-	new_mso = scb_o->mso & 0xfffffffffff00000UL;
+	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+		new_mso = scb_o->mso & 0xfffffffffff00000UL;
 	/* if the hva of the prefix changes, we have to remap the prefix */
 	if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
 		prefix_unmapped(vsie_page);

From 5630a8e82b1ee4d13daa500c045603c5b4801fd9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:53:51 +0100
Subject: [PATCH 157/302] KVM: s390: vsie: support intervention-bypass

We can easily enable intervention bypass for guest 2, so it can use it
for guest 3.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 9ed07479714f01..347f4f61b65650 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -103,6 +103,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_SIIF	3
 #define KVM_S390_VM_CPU_FEAT_GPERE	4
 #define KVM_S390_VM_CPU_FEAT_GSLS	5
+#define KVM_S390_VM_CPU_FEAT_IB		6
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ce9813afd50280..5ec598ca7660a4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -273,6 +273,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
 	if (sclp.has_gsls)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+	if (sclp.has_ib)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index ea65bf2f02011c..d29bd592fb3d6e 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -322,6 +322,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
 		scb_s->eca |= scb_o->eca & 0x00000001U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+		scb_s->eca |= scb_o->eca & 0x40000000U;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);

From 13ee3f678b1117d7511a2c5e10549f7c37f4cadf Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:54:37 +0100
Subject: [PATCH 158/302] KVM: s390: vsie: support
 conditional-external-interception

We can easily enable CEI for guest 2, so it can use it for guest 3.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 347f4f61b65650..7630dd70ed5704 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -104,6 +104,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_GPERE	4
 #define KVM_S390_VM_CPU_FEAT_GSLS	5
 #define KVM_S390_VM_CPU_FEAT_IB		6
+#define KVM_S390_VM_CPU_FEAT_CEI	7
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5ec598ca7660a4..1c1188ba104254 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -275,6 +275,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
 	if (sclp.has_ib)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+	if (sclp.has_cei)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index d29bd592fb3d6e..f3a4a0bad4a7e8 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -324,6 +324,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->eca |= scb_o->eca & 0x00000001U;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
 		scb_s->eca |= scb_o->eca & 0x40000000U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+		scb_s->eca |= scb_o->eca & 0x80000000U;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);

From 7fd7f39daa3da822122124730437c4f37e4d82de Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 Nov 2015 16:56:23 +0100
Subject: [PATCH 159/302] KVM: s390: vsie: support IBS interpretation

We can easily enable IBS for guest 2, so it can use it for guest 3.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h | 1 +
 arch/s390/kvm/kvm-s390.c         | 2 ++
 arch/s390/kvm/vsie.c             | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 7630dd70ed5704..c128567d1cd3c5 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -105,6 +105,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_GSLS	5
 #define KVM_S390_VM_CPU_FEAT_IB		6
 #define KVM_S390_VM_CPU_FEAT_CEI	7
+#define KVM_S390_VM_CPU_FEAT_IBS	8
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1c1188ba104254..8ba7a98a50cfbf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -277,6 +277,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
 	if (sclp.has_cei)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+	if (sclp.has_ibs)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
 }
 
 int kvm_arch_init(void *opaque)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index f3a4a0bad4a7e8..3ececbbd6bb084 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -111,6 +111,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		newflags |= cpuflags & CPUSTAT_P;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
 		newflags |= cpuflags & CPUSTAT_SM;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+		newflags |= cpuflags & CPUSTAT_IBS;
 
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;

From 1b7029bec18718eca8cfc5c1c0917444f019be1e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 8 Jul 2015 13:25:31 +0200
Subject: [PATCH 160/302] KVM: s390: vsie: try to refault after a reported
 fault to g2

We can avoid one unneeded SIE entry after we reported a fault to g2.
Theoretically, g2 resolves the fault and we can create the shadow mapping
directly, instead of failing again when entering the SIE.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 3ececbbd6bb084..7482488d21d0a0 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -28,7 +28,9 @@ struct vsie_page {
 	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
 	/* the shadow gmap in use by the vsie_page */
 	struct gmap *gmap;			/* 0x0208 */
-	__u8 reserved[0x0700 - 0x0210];		/* 0x0210 */
+	/* address of the last reported fault to guest2 */
+	unsigned long fault_addr;		/* 0x0210 */
+	__u8 reserved[0x0700 - 0x0218];		/* 0x0218 */
 	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
 	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
 } __packed;
@@ -676,10 +678,27 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		rc = inject_fault(vcpu, rc,
 				  current->thread.gmap_addr,
 				  current->thread.gmap_write_flag);
+		if (rc >= 0)
+			vsie_page->fault_addr = current->thread.gmap_addr;
 	}
 	return rc;
 }
 
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+			      struct vsie_page *vsie_page)
+{
+	if (vsie_page->fault_addr)
+		kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+				      vsie_page->fault_addr);
+	vsie_page->fault_addr = 0;
+}
+
 static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
 {
 	vsie_page->scb_s.icptcode = 0;
@@ -737,6 +756,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	int rc;
 
+	handle_last_fault(vcpu, vsie_page);
+
 	if (need_resched())
 		schedule();
 	if (test_cpu_flag(CIF_MCCK_PENDING))
@@ -928,6 +949,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 	vsie_page = page_to_virt(page);
 	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
 	release_gmap_shadow(vsie_page);
+	vsie_page->fault_addr = 0;
 	vsie_page->scb_s.ihcpu = 0xffffU;
 	return vsie_page;
 }

From adbf16985c387851fd3454ca34893705dbde7f98 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Fri, 27 May 2016 22:03:52 +0200
Subject: [PATCH 161/302] KVM: s390: vsie: speed up VCPU irq delivery when
 handling vsie

Whenever we want to wake up a VCPU (e.g. when injecting an IRQ), we
have to kick it out of vsie, so the request will be handled faster.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  2 ++
 arch/s390/kvm/interrupt.c        |  5 +++++
 arch/s390/kvm/kvm-s390.h         |  1 +
 arch/s390/kvm/vsie.c             | 35 ++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 190ad63291fb22..946fc86202fdac 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -549,6 +549,8 @@ struct kvm_guestdbg_info_arch {
 
 struct kvm_vcpu_arch {
 	struct kvm_s390_sie_block *sie_block;
+	/* if vsie is active, currently executed shadow sie control block */
+	struct kvm_s390_sie_block *vsie_block;
 	unsigned int      host_acrs[NUM_ACRS];
 	struct fpu	  host_fpregs;
 	struct kvm_s390_local_interrupt local_int;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d72c4a877622af..ca19627779db09 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 		swake_up(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
+	/*
+	 * The VCPU might not be sleeping but is executing the VSIE. Let's
+	 * kick it, so it leaves the SIE to process the request.
+	 */
+	kvm_s390_vsie_kick(vcpu);
 }
 
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index b137fbaac91cd8..ffbbdd28538522 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -254,6 +254,7 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
 /* implemented in vsie.c */
 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
 void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
 				 unsigned long end);
 void kvm_s390_vsie_init(struct kvm *kvm);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 7482488d21d0a0..c8c8763e78229b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -837,6 +837,23 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+				struct vsie_page *vsie_page)
+{
+	WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+	WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
 /*
  * Run the vsie on a shadowed scb, managing the gmap shadow, handling
  * prefix pages and faults.
@@ -860,6 +877,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			rc = do_vsie_run(vcpu, vsie_page);
 			gmap_enable(vcpu->arch.gmap);
 		}
+		atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
 
 		if (rc == -EAGAIN)
 			rc = 0;
@@ -1000,7 +1018,9 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
 	rc = pin_blocks(vcpu, vsie_page);
 	if (rc)
 		goto out_unshadow;
+	register_shadow_scb(vcpu, vsie_page);
 	rc = vsie_run(vcpu, vsie_page);
+	unregister_shadow_scb(vcpu);
 	unpin_blocks(vcpu, vsie_page);
 out_unshadow:
 	unshadow_scb(vcpu, vsie_page);
@@ -1039,3 +1059,18 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
 	kvm->arch.vsie.page_count = 0;
 	mutex_unlock(&kvm->arch.vsie.mutex);
 }
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+	struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+	/*
+	 * Even if the VCPU lets go of the shadow sie block reference, it is
+	 * still valid in the cache. So we can safely kick it.
+	 */
+	if (scb) {
+		atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+		if (scb->prog0c & PROG_IN_SIE)
+			atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+	}
+}

From 94a15de8fb2667791d66c49610676ea2add90034 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 18 Feb 2016 10:15:43 +0100
Subject: [PATCH 162/302] KVM: s390: don't use CPUSTAT_WAIT to detect if a VCPU
 is idle

As we also want to use CPUSTAT_WAIT when a VCPU is not idle, to force
interception of external calls, let's check the idle bitmap instead.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.h |  2 +-
 arch/s390/kvm/sigp.c     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ffbbdd28538522..031f451bb2cf79 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+	return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1f1b50..1a252f5370818e 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
 	const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
 	u16 p_asn, s_asn;
 	psw_t *psw;
-	u32 flags;
+	bool idle;
 
-	flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+	idle = is_vcpu_idle(vcpu);
 	psw = &dst_vcpu->arch.sie_block->gpsw;
 	p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff;  /* Primary ASN */
 	s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff;  /* Secondary ASN */
 
 	/* Inject the emergency signal? */
-	if (!(flags & CPUSTAT_STOPPED)
+	if (!is_vcpu_stopped(vcpu)
 	    || (psw->mask & psw_int_mask) != psw_int_mask
-	    || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
-	    || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+	    || (idle && psw->addr != 0)
+	    || (!idle && (asn == p_asn || asn == s_asn))) {
 		return __inject_sigp_emergency(vcpu, dst_vcpu);
 	} else {
 		*reg &= 0xffffffff00000000UL;

From b917ae573f5b3f7fee8cfb0d42d74bd8641f6401 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 7 Jul 2015 20:39:35 +0200
Subject: [PATCH 163/302] KVM: s390: vsie: speed up VCPU external calls

Whenever a SIGP external call is injected via the SIGP external call
interpretation facility, the VCPU is not kicked. When a VCPU is currently
in the VSIE, the external call might not be processed immediately.

Therefore we have to provoke partial execution exceptions, which lead to a
kick of the VCPU and therefore also a kick out of the VSIE. This is done by
simulating the WAIT state. This bit has no other side effects.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index c8c8763e78229b..90781ba5280305 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -844,6 +844,11 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
 				struct vsie_page *vsie_page)
 {
 	WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+	/*
+	 * External calls have to lead to a kick of the vcpu and
+	 * therefore the vsie -> Simulate Wait state.
+	 */
+	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
 }
 
 /*
@@ -851,6 +856,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
  */
 static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 {
+	atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
 	WRITE_ONCE(vcpu->arch.vsie_block, NULL);
 }
 

From 91473b487dd58af6384c5c3db13de50defa2c106 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Thu, 29 Oct 2015 10:30:36 +0100
Subject: [PATCH 164/302] KVM: s390: vsie: correctly set and handle guest TOD

Guest 2 sets up the epoch of guest 3 from its point of view. Therefore,
we have to add the guest 2 epoch to the guest 3 epoch. We also have to take
care of guest 2 epoch changes on STP syncs. This will work just fine by
also updating the guest 3 epoch when a vsie_block has been set for a VCPU.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 2 ++
 arch/s390/kvm/vsie.c     | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8ba7a98a50cfbf..6fdf1f7647d7c9 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -176,6 +176,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
 			vcpu->arch.sie_block->epoch -= *delta;
 			if (vcpu->arch.cputm_enabled)
 				vcpu->arch.cputm_start += *delta;
+			if (vcpu->arch.vsie_block)
+				vcpu->arch.vsie_block->epoch -= *delta;
 		}
 	}
 	return NOTIFY_OK;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 90781ba5280305..6895e7b3be1237 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -843,12 +843,21 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
 static void register_shadow_scb(struct kvm_vcpu *vcpu,
 				struct vsie_page *vsie_page)
 {
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
 	WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
 	/*
 	 * External calls have to lead to a kick of the vcpu and
 	 * therefore the vsie -> Simulate Wait state.
 	 */
 	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	/*
+	 * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+	 * automatically be adjusted on tod clock changes via kvm_clock_sync.
+	 */
+	preempt_disable();
+	scb_s->epoch += vcpu->kvm->arch.epoch;
+	preempt_enable();
 }
 
 /*

From 5d3876a8bf4607b72cbe754278d19c68990b57a8 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 13 Apr 2016 17:06:50 +0200
Subject: [PATCH 165/302] KVM: s390: vsie: add indication for future features

We have certain SIE features that we cannot support for now.
Let's add these features, so user space can directly prepare to enable
them, so we don't have to update yet another component.

In addition, add a comment block explaining why it is currently not
possible to forward/enable these features.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/uapi/asm/kvm.h |  4 ++++
 arch/s390/kvm/kvm-s390.c         | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index c128567d1cd3c5..a2ffec4139ad1c 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -106,6 +106,10 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_IB		6
 #define KVM_S390_VM_CPU_FEAT_CEI	7
 #define KVM_S390_VM_CPU_FEAT_IBS	8
+#define KVM_S390_VM_CPU_FEAT_SKEY	9
+#define KVM_S390_VM_CPU_FEAT_CMMA	10
+#define KVM_S390_VM_CPU_FEAT_PFMFI	11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF	12
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6fdf1f7647d7c9..31cf22f7d846bd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,6 +281,24 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
 	if (sclp.has_ibs)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+	/*
+	 * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+	 * all skey handling functions read/set the skey from the PGSTE
+	 * instead of the real storage key.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+	 * pages being detected as preserved although they are resident.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+	 * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+	 *
+	 * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+	 * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+	 * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+	 * cannot easily shadow the SCA because of the ipte lock.
+	 */
 }
 
 int kvm_arch_init(void *opaque)

From a411edf1320ed8fa3b4560902ac4e033c4a72bcf Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 2 Feb 2016 15:41:22 +0100
Subject: [PATCH 166/302] KVM: s390: vsie: add module parameter "nested"

Let's be careful first and allow nested virtualization only if enabled
by the system administrator. In addition, user space still has to
explicitly enable it via SCLP features for it to work.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 31cf22f7d846bd..03eeeb0ded2470 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -125,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[16] = {
 	0xffe6000000000000UL,
@@ -264,7 +269,7 @@ static void kvm_s390_cpu_feat_init(void)
 	 * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
 	 */
 	if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
-	    !test_facility(3))
+	    !test_facility(3) || !nested)
 		return;
 	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
 	if (sclp.has_64bscao)

From 3b84080b9512bcacad3805f345fb8f092c8d3a7d Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Wed, 22 Jun 2016 14:59:54 +0800
Subject: [PATCH 167/302] KVM: VMX: move msr_ia32_feature_control to vcpu_vmx

msr_ia32_feature_control will be used for LMCE and will no longer depend
only on nested, so move it from struct nested_vmx to struct vcpu_vmx.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e185649fb8b72c..ad66978281c1b6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -428,7 +428,6 @@ struct nested_vmx {
 	struct pi_desc *pi_desc;
 	bool pi_pending;
 	u16 posted_intr_nv;
-	u64 msr_ia32_feature_control;
 
 	struct hrtimer preemption_timer;
 	bool preemption_timer_expired;
@@ -612,6 +611,8 @@ struct vcpu_vmx {
 	bool guest_pkru_valid;
 	u32 guest_pkru;
 	u32 host_pkru;
+
+	u64 msr_ia32_feature_control;
 };
 
 enum segment_cache_field {
@@ -2970,9 +2971,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
 	case MSR_IA32_FEATURE_CONTROL:
-		if (!nested_vmx_allowed(vcpu))
-			return 1;
-		msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+		msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!nested_vmx_allowed(vcpu))
@@ -3064,10 +3063,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		if (!nested_vmx_allowed(vcpu) ||
-		    (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+		    (to_vmx(vcpu)->msr_ia32_feature_control &
 		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
 			return 1;
-		vmx->nested.msr_ia32_feature_control = data;
+		vmx->msr_ia32_feature_control = data;
 		if (msr_info->host_initiated && data == 0)
 			vmx_leave_nested(vcpu);
 		break;
@@ -6939,7 +6938,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
 			!= VMXON_NEEDED_FEATURES) {
 		kvm_inject_gp(vcpu, 0);
 		return 1;

From 37e4c997dadf713d5b9cb88a801eb38d61a2aefc Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Wed, 22 Jun 2016 14:59:55 +0800
Subject: [PATCH 168/302] KVM: VMX: validate individual bits of guest
 MSR_IA32_FEATURE_CONTROL

KVM currently does not check the value written to guest
MSR_IA32_FEATURE_CONTROL, though bits corresponding to disabled features
may be set. This patch makes KVM validate individual bits written to
guest MSR_IA32_FEATURE_CONTROL according to enabled features.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad66978281c1b6..0a3ccb073bb452 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -612,7 +612,13 @@ struct vcpu_vmx {
 	u32 guest_pkru;
 	u32 host_pkru;
 
+	/*
+	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+	 * in msr_ia32_feature_control_valid_bits.
+	 */
 	u64 msr_ia32_feature_control;
+	u64 msr_ia32_feature_control_valid_bits;
 };
 
 enum segment_cache_field {
@@ -2929,6 +2935,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 0;
 }
 
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+						 uint64_t val)
+{
+	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+	return !(val & ~valid_bits);
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -3062,7 +3076,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
 	case MSR_IA32_FEATURE_CONTROL:
-		if (!nested_vmx_allowed(vcpu) ||
+		if (!vmx_feature_control_msr_valid(vcpu, data) ||
 		    (to_vmx(vcpu)->msr_ia32_feature_control &
 		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
 			return 1;
@@ -9055,6 +9069,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -9202,6 +9218,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 			vmx->nested.nested_vmx_secondary_ctls_high &=
 				~SECONDARY_EXEC_PCOMMIT;
 	}
+
+	if (nested_vmx_allowed(vcpu))
+		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	else
+		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)

From c45dcc71b794b5a346a43ad83bdcfac2138f0a2c Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Wed, 22 Jun 2016 14:59:56 +0800
Subject: [PATCH 169/302] KVM: VMX: enable guest access to LMCE related MSRs

On Intel platforms, this patch adds LMCE to KVM MCE supported
capabilities and handles guest access to LMCE related MSRs.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
[Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported
           Only enable LMCE on Intel platform
           Check MSR_IA32_FEATURE_CONTROL when handling guest
             access to MSR_IA32_MCG_EXT_CTL]
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++++
 arch/x86/kvm/vmx.c              | 29 +++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 15 +++++++++------
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 360c5171ea1a89..7a628fb6a2c2b0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -598,6 +598,7 @@ struct kvm_vcpu_arch {
 	u64 mcg_cap;
 	u64 mcg_status;
 	u64 mcg_ctl;
+	u64 mcg_ext_ctl;
 	u64 *mce_banks;
 
 	/* Cache MMIO info */
@@ -1008,6 +1009,8 @@ struct kvm_x86_ops {
 
 	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
 	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+	void (*setup_mce)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1082,6 +1085,8 @@ extern u64  kvm_max_tsc_scaling_ratio;
 /* 1ull << kvm_tsc_scaling_ratio_frac_bits */
 extern u64  kvm_default_tsc_scaling_ratio;
 
+extern u64 kvm_mce_cap_supported;
+
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
 	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0a3ccb073bb452..943609f06c90c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2984,6 +2984,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
+	case MSR_IA32_MCG_EXT_CTL:
+		if (!msr_info->host_initiated &&
+		    !(to_vmx(vcpu)->msr_ia32_feature_control &
+		      FEATURE_CONTROL_LMCE))
+			return 1;
+		msr_info->data = vcpu->arch.mcg_ext_ctl;
+		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
 		break;
@@ -3075,6 +3082,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC_ADJUST:
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
+	case MSR_IA32_MCG_EXT_CTL:
+		if ((!msr_info->host_initiated &&
+		     !(to_vmx(vcpu)->msr_ia32_feature_control &
+		       FEATURE_CONTROL_LMCE)) ||
+		    (data & ~MCG_EXT_CTL_LMCE_EN))
+			return 1;
+		vcpu->arch.mcg_ext_ctl = data;
+		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		if (!vmx_feature_control_msr_valid(vcpu, data) ||
 		    (to_vmx(vcpu)->msr_ia32_feature_control &
@@ -6484,6 +6499,8 @@ static __init int hardware_setup(void)
 
 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
+	kvm_mce_cap_supported |= MCG_LMCE_P;
+
 	return alloc_kvm_area();
 
 out8:
@@ -11109,6 +11126,16 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 	return ret;
 }
 
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+			FEATURE_CONTROL_LMCE;
+	else
+		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+			~FEATURE_CONTROL_LMCE;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -11238,6 +11265,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_hv_timer = vmx_set_hv_timer,
 	.cancel_hv_timer = vmx_cancel_hv_timer,
 #endif
+
+	.setup_mce = vmx_setup_mce,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 299219630c9470..0a42fc729ff39c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -70,7 +70,8 @@
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
 	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -984,6 +985,7 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
+	MSR_IA32_MCG_EXT_CTL,
 	MSR_IA32_SMBASE,
 };
 
@@ -2685,11 +2687,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
-		u64 mce_cap;
-
-		mce_cap = KVM_MCE_CAP_SUPPORTED;
 		r = -EFAULT;
-		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+		if (copy_to_user(argp, &kvm_mce_cap_supported,
+				 sizeof(kvm_mce_cap_supported)))
 			goto out;
 		r = 0;
 		break;
@@ -2872,7 +2872,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
-	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
 		goto out;
 	r = 0;
 	vcpu->arch.mcg_cap = mcg_cap;
@@ -2882,6 +2882,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	/* Init IA32_MCi_CTL to all 1s */
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+	if (kvm_x86_ops->setup_mce)
+		kvm_x86_ops->setup_mce(vcpu);
 out:
 	return r;
 }

From 87aeb54f1b9891cf08b84b3f0c34f220a4977c4f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 17:48:56 +0200
Subject: [PATCH 170/302] kvm: x86: use getboottime64

KVM reads the current boottime value as a struct timespec in order to
calculate the guest wallclock time, resulting in an overflow in 2038
on 32-bit systems.

The data then gets passed as an unsigned 32-bit number to the guest,
and that in turn overflows in 2106.

We cannot do much about the second overflow, which affects both 32-bit
and 64-bit hosts, but we can ensure that they both behave the same
way and don't overflow until 2106, by using getboottime64() to read
a timespec64 value.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a42fc729ff39c..9e50e2ad6d0829 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1165,7 +1165,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 	int version;
 	int r;
 	struct pvclock_wall_clock wc;
-	struct timespec boot;
+	struct timespec64 boot;
 
 	if (!wall_clock)
 		return;
@@ -1188,13 +1188,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 	 * wall clock specified here.  guest system time equals host
 	 * system time for us, thus we must fill in host boot time here.
 	 */
-	getboottime(&boot);
+	getboottime64(&boot);
 
 	if (kvm->arch.kvmclock_offset) {
-		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
-		boot = timespec_sub(boot, ts);
+		struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+		boot = timespec64_sub(boot, ts);
 	}
-	wc.sec = boot.tv_sec;
+	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
 	wc.nsec = boot.tv_nsec;
 	wc.version = version;
 

From fb6cec1492d6a693d33224487061e92d0275c6e2 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 17 Jun 2016 18:19:40 +0100
Subject: [PATCH 171/302] MIPS: KVM: Combine entry trace events into class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Combine the kvm_enter, kvm_reenter and kvm_out trace events into a
single kvm_transition event class to reduce duplication and bloat.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Fixes: 93258604ab6d ("MIPS: KVM: Add guest mode switch trace events")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/trace.h | 62 +++++++++++++++----------------------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index a38bdab685745e..c858cf16807840 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -20,50 +20,32 @@
 /*
  * Tracepoints for VM enters
  */
-TRACE_EVENT(kvm_enter,
-	    TP_PROTO(struct kvm_vcpu *vcpu),
-	    TP_ARGS(vcpu),
-	    TP_STRUCT__entry(
-			__field(unsigned long, pc)
-	    ),
-
-	    TP_fast_assign(
-			__entry->pc = vcpu->arch.pc;
-	    ),
-
-	    TP_printk("PC: 0x%08lx",
-		      __entry->pc)
-);
-
-TRACE_EVENT(kvm_reenter,
-	    TP_PROTO(struct kvm_vcpu *vcpu),
-	    TP_ARGS(vcpu),
-	    TP_STRUCT__entry(
-			__field(unsigned long, pc)
-	    ),
-
-	    TP_fast_assign(
-			__entry->pc = vcpu->arch.pc;
-	    ),
-
-	    TP_printk("PC: 0x%08lx",
-		      __entry->pc)
+DECLARE_EVENT_CLASS(kvm_transition,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+	TP_STRUCT__entry(
+		__field(unsigned long, pc)
+	),
+
+	TP_fast_assign(
+		__entry->pc = vcpu->arch.pc;
+	),
+
+	TP_printk("PC: 0x%08lx",
+		  __entry->pc)
 );
 
-TRACE_EVENT(kvm_out,
-	    TP_PROTO(struct kvm_vcpu *vcpu),
-	    TP_ARGS(vcpu),
-	    TP_STRUCT__entry(
-			__field(unsigned long, pc)
-	    ),
+DEFINE_EVENT(kvm_transition, kvm_enter,
+	     TP_PROTO(struct kvm_vcpu *vcpu),
+	     TP_ARGS(vcpu));
 
-	    TP_fast_assign(
-			__entry->pc = vcpu->arch.pc;
-	    ),
+DEFINE_EVENT(kvm_transition, kvm_reenter,
+	     TP_PROTO(struct kvm_vcpu *vcpu),
+	     TP_ARGS(vcpu));
 
-	    TP_printk("PC: 0x%08lx",
-		      __entry->pc)
-);
+DEFINE_EVENT(kvm_transition, kvm_out,
+	     TP_PROTO(struct kvm_vcpu *vcpu),
+	     TP_ARGS(vcpu));
 
 /* The first 32 exit reasons correspond to Cause.ExcCode */
 #define KVM_TRACE_EXIT_INT		 0

From ebaac1736245e78109cd47d453a86a18dcfc94b8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 15 Jun 2016 15:09:28 +0200
Subject: [PATCH 172/302] context_tracking: move rcu_virt_note_context_switch
 out of kvm_host.h

Make kvm_guest_{enter,exit} and __kvm_guest_{enter,exit} trivial wrappers
around the code in context_tracking.h.  Name the context_tracking.h functions
consistently with those used for the kernel<->user switch.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/context_tracking.h | 38 ++++++++++++++++++++++++++++----
 include/linux/kvm_host.h         | 25 ++++-----------------
 2 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index d259274238db36..ff4a32d24d5675 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -84,7 +84,8 @@ static inline void context_tracking_init(void) { }
 
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline void guest_enter(void)
+/* must be called with irqs disabled */
+static inline void guest_enter_irqoff(void)
 {
 	if (vtime_accounting_cpu_enabled())
 		vtime_guest_enter(current);
@@ -93,9 +94,19 @@ static inline void guest_enter(void)
 
 	if (context_tracking_is_enabled())
 		__context_tracking_enter(CONTEXT_GUEST);
+
+	/* KVM does not hold any references to rcu protected data when it
+	 * switches CPU into a guest mode. In fact switching to a guest mode
+	 * is very similar to exiting to userspace from rcu point of view. In
+	 * addition CPU may stay in a guest mode for quite a long time (up to
+	 * one time slice). Lets treat guest mode as quiescent state, just like
+	 * we do with user-mode execution.
+	 */
+	if (!context_tracking_cpu_is_enabled())
+		rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
 	if (context_tracking_is_enabled())
 		__context_tracking_exit(CONTEXT_GUEST);
@@ -107,7 +118,7 @@ static inline void guest_exit(void)
 }
 
 #else
-static inline void guest_enter(void)
+static inline void guest_enter_irqoff(void)
 {
 	/*
 	 * This is running in ioctl context so its safe
@@ -116,9 +127,10 @@ static inline void guest_enter(void)
 	 */
 	vtime_account_system(current);
 	current->flags |= PF_VCPU;
+	rcu_virt_note_context_switch(smp_processor_id());
 }
 
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
 {
 	/* Flush the guest cputime we spent on the guest */
 	vtime_account_system(current);
@@ -126,4 +138,22 @@ static inline void guest_exit(void)
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
+static inline void guest_enter(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	guest_enter_irqoff();
+	local_irq_restore(flags);
+}
+
+static inline void guest_exit(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	guest_exit_irqoff();
+	local_irq_restore(flags);
+}
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0640ee92b97872..ffff405226887b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -878,40 +878,23 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 /* must be called with irqs disabled */
 static inline void __kvm_guest_enter(void)
 {
-	guest_enter();
-	/* KVM does not hold any references to rcu protected data when it
-	 * switches CPU into a guest mode. In fact switching to a guest mode
-	 * is very similar to exiting to userspace from rcu point of view. In
-	 * addition CPU may stay in a guest mode for quite a long time (up to
-	 * one time slice). Lets treat guest mode as quiescent state, just like
-	 * we do with user-mode execution.
-	 */
-	if (!context_tracking_cpu_is_enabled())
-		rcu_virt_note_context_switch(smp_processor_id());
+	guest_enter_irqoff();
 }
 
 /* must be called with irqs disabled */
 static inline void __kvm_guest_exit(void)
 {
-	guest_exit();
+	guest_exit_irqoff();
 }
 
 static inline void kvm_guest_enter(void)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__kvm_guest_enter();
-	local_irq_restore(flags);
+	guest_enter();
 }
 
 static inline void kvm_guest_exit(void)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__kvm_guest_exit();
-	local_irq_restore(flags);
+	guest_exit();
 }
 
 /*

From c8dddecdeb2ae95a6535855ce8a26b7197471b16 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 15:00:45 +0100
Subject: [PATCH 173/302] arm/arm64: KVM: Add a protection parameter to
 create_hyp_mappings

Currently, create_hyp_mappings applies a "one size fits all" page
protection (PAGE_HYP). As we're heading towards separate protections
for different sections, let's make this protection a parameter, and
let the callers pass their preferred protection (PAGE_HYP for everyone
for the time being).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   |  2 +-
 arch/arm/kvm/arm.c               | 13 +++++++------
 arch/arm/kvm/mmu.c               |  5 +++--
 arch/arm64/include/asm/kvm_mmu.h |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index f9a65061130b66..6cb4d4d5c48c41 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -49,7 +49,7 @@
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index f20ca84537f5d7..45dd6df70cdf87 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -122,7 +122,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (ret)
 		goto out_fail_alloc;
 
-	ret = create_hyp_mappings(kvm, kvm + 1);
+	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
 	if (ret)
 		goto out_free_stage2_pgd;
 
@@ -239,7 +239,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
-	err = create_hyp_mappings(vcpu, vcpu + 1);
+	err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
 	if (err)
 		goto vcpu_uninit;
 
@@ -1293,14 +1293,14 @@ static int init_hyp_mode(void)
 	 * Map the Hyp-code called directly from the host
 	 */
 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-				  kvm_ksym_ref(__hyp_text_end));
+				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP);
 	if (err) {
 		kvm_err("Cannot map world-switch code\n");
 		goto out_err;
 	}
 
 	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-				  kvm_ksym_ref(__end_rodata));
+				  kvm_ksym_ref(__end_rodata), PAGE_HYP);
 	if (err) {
 		kvm_err("Cannot map rodata section\n");
 		goto out_err;
@@ -1311,7 +1311,8 @@ static int init_hyp_mode(void)
 	 */
 	for_each_possible_cpu(cpu) {
 		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+					  PAGE_HYP);
 
 		if (err) {
 			kvm_err("Cannot map hyp stack\n");
@@ -1323,7 +1324,7 @@ static int init_hyp_mode(void)
 		kvm_cpu_context_t *cpu_ctxt;
 
 		cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
 
 		if (err) {
 			kvm_err("Cannot map host CPU state: %d\n", err);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 45c43aecb8f2f3..49cb5ccf6c2377 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -679,12 +679,13 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
  * @from:	The virtual kernel start address of the range
  * @to:		The virtual kernel end address of the range (exclusive)
+ * @prot:	The protection to be applied to this range
  *
  * The same virtual address as the kernel virtual address is also used
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
  */
-int create_hyp_mappings(void *from, void *to)
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 {
 	phys_addr_t phys_addr;
 	unsigned long virt_addr;
@@ -704,7 +705,7 @@ int create_hyp_mappings(void *from, void *to)
 		err = __create_hyp_mappings(hyp_pgd, virt_addr,
 					    virt_addr + PAGE_SIZE,
 					    __phys_to_pfn(phys_addr),
-					    PAGE_HYP);
+					    prot);
 		if (err)
 			return err;
 	}
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index f05ac27d033ed8..fdfbddbe9fbac5 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -81,7 +81,7 @@ alternative_endif
 
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);

From 1166f3fe6a86798e4fcd24cefb6b06da3fd0420f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 15:00:46 +0100
Subject: [PATCH 174/302] arm64: Add PTE_HYP_XN page table flag

EL2 page tables can be configured to deny code from being
executed, which is done by setting bit 54 in the page descriptor.

It is the same bit as PTE_UXN, but the "USER" reference felt odd
in the hypervisor code.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 2813748e2f242c..c3ae239db3eeec 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -164,6 +164,7 @@
 #define PTE_CONT		(_AT(pteval_t, 1) << 52)	/* Contiguous range */
 #define PTE_PXN			(_AT(pteval_t, 1) << 53)	/* Privileged XN */
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
+#define PTE_HYP_XN		(_AT(pteval_t, 1) << 54)	/* HYP XN */
 
 /*
  * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).

From 74a6b8885f7026e33f8a4776b7ac17c76b8e5a52 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 15:00:47 +0100
Subject: [PATCH 175/302] arm/arm64: KVM: Enforce HYP read-only mapping of the
 kernel's rodata section

In order to be able to use C code in HYP, we're now mapping the kernel's
rodata in HYP. It works absolutely fine, except that we're mapping it RWX,
which is not what it should be.

Add a new HYP_PAGE_RO protection, and pass it as the protection flags
when mapping the rodata section.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/pgtable.h        | 1 +
 arch/arm/kvm/arm.c                    | 2 +-
 arch/arm64/include/asm/pgtable-prot.h | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 348caabb7625ee..f3320870a90af3 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -98,6 +98,7 @@ extern pgprot_t		pgprot_s2_device;
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
 #define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP_RO		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE		_MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
 #define PAGE_S2			_MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
 #define PAGE_S2_DEVICE		_MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 45dd6df70cdf87..b30897679e53c9 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1300,7 +1300,7 @@ static int init_hyp_mode(void)
 	}
 
 	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-				  kvm_ksym_ref(__end_rodata), PAGE_HYP);
+				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
 	if (err) {
 		kvm_err("Cannot map rodata section\n");
 		goto out_err;
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 29fcb33ab4019f..88db58cac5879e 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -56,6 +56,7 @@
 #define PAGE_KERNEL_EXEC_CONT	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
 #define PAGE_HYP		__pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP_RO		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE		__pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
 #define PAGE_S2			__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)

From 5900270550cbb8a272bfc248b69531cd44dcf0d5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 15:00:48 +0100
Subject: [PATCH 176/302] arm/arm64: KVM: Map the HYP text as read-only

There should be no reason for mapping the HYP text read/write.

As such, let's have a new set of flags (PAGE_HYP_EXEC) that allows
execution, but marks the page as read-only, and update the two call
sites that deal with mapping code.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/pgtable.h        | 1 +
 arch/arm/kvm/arm.c                    | 2 +-
 arch/arm/kvm/mmu.c                    | 6 +++---
 arch/arm64/include/asm/pgtable-prot.h | 1 +
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f3320870a90af3..7487bf9f97dc45 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -98,6 +98,7 @@ extern pgprot_t		pgprot_s2_device;
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
 #define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP_EXEC		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
 #define PAGE_HYP_RO		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE		_MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
 #define PAGE_S2			_MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index b30897679e53c9..c74483fc39f2c3 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1293,7 +1293,7 @@ static int init_hyp_mode(void)
 	 * Map the Hyp-code called directly from the host
 	 */
 	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP);
+				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
 	if (err) {
 		kvm_err("Cannot map world-switch code\n");
 		goto out_err;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 49cb5ccf6c2377..679608fa1666a7 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1733,7 +1733,7 @@ int kvm_mmu_init(void)
 	err = 	__create_hyp_mappings(boot_hyp_pgd,
 				      hyp_idmap_start, hyp_idmap_end,
 				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP);
+				      PAGE_HYP_EXEC);
 
 	if (err) {
 		kvm_err("Failed to idmap %lx-%lx\n",
@@ -1756,7 +1756,7 @@ int kvm_mmu_init(void)
 	err = 	__create_hyp_mappings(boot_hyp_pgd,
 				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
 				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP);
+				      PAGE_HYP_EXEC);
 	if (err) {
 		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
 			TRAMPOLINE_VA);
@@ -1767,7 +1767,7 @@ int kvm_mmu_init(void)
 	err = 	__create_hyp_mappings(hyp_pgd,
 				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
 				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP);
+				      PAGE_HYP_EXEC);
 	if (err) {
 		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
 			TRAMPOLINE_VA);
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 88db58cac5879e..380204847c20ed 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -56,6 +56,7 @@
 #define PAGE_KERNEL_EXEC_CONT	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
 #define PAGE_HYP		__pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP_EXEC		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
 #define PAGE_HYP_RO		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE		__pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 

From 0996353f8ec6c6dba4a1f916bf6d9ace6f7d2b49 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 13 Jun 2016 15:00:49 +0100
Subject: [PATCH 177/302] arm/arm64: KVM: Make default HYP mappings
 non-executable

Structures that can be generally written to don't have any requirement
to be executable (quite the opposite). This includes the kvm and vcpu
structures, as well as the stacks.

Let's change the default to incorporate the XN flag.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/pgtable.h        | 2 +-
 arch/arm64/include/asm/pgtable-prot.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 7487bf9f97dc45..e0d76ba24b3087 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -97,7 +97,7 @@ extern pgprot_t		pgprot_s2_device;
 #define PAGE_READONLY_EXEC	_MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
-#define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
 #define PAGE_HYP_EXEC		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
 #define PAGE_HYP_RO		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE		_MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 380204847c20ed..39f5252673f7a2 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -55,7 +55,7 @@
 #define PAGE_KERNEL_EXEC	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_EXEC_CONT	__pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
-#define PAGE_HYP		__pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
 #define PAGE_HYP_EXEC		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
 #define PAGE_HYP_RO		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE		__pgprot(PROT_DEVICE_nGnRE | PTE_HYP)

From 6edaa5307f3f51e4e56dc4c63f68a69d88c6ddf5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 15 Jun 2016 15:18:26 +0200
Subject: [PATCH 178/302] KVM: remove kvm_guest_enter/exit wrappers

Use the functions from context_tracking.h directly.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/arm.c           |  8 ++++----
 arch/mips/kvm/mips.c         |  4 ++--
 arch/powerpc/kvm/book3s_hv.c |  4 ++--
 arch/powerpc/kvm/book3s_pr.c |  4 ++--
 arch/powerpc/kvm/booke.c     |  4 ++--
 arch/powerpc/kvm/powerpc.c   |  2 +-
 arch/s390/kvm/kvm-s390.c     |  4 ++--
 arch/s390/kvm/vsie.c         |  4 ++--
 arch/x86/kvm/x86.c           |  4 ++--
 include/linux/kvm_host.h     | 22 ----------------------
 10 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index f20ca84537f5d7..9ac4970882fefd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -615,7 +615,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * Enter the guest
 		 */
 		trace_kvm_entry(*vcpu_pc(vcpu));
-		__kvm_guest_enter();
+		guest_enter_irqoff();
 		vcpu->mode = IN_GUEST_MODE;
 
 		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
@@ -641,14 +641,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		local_irq_enable();
 
 		/*
-		 * We do local_irq_enable() before calling kvm_guest_exit() so
+		 * We do local_irq_enable() before calling guest_exit() so
 		 * that if a timer interrupt hits while running the guest we
 		 * account that tick as being spent in the guest.  We enable
-		 * preemption after calling kvm_guest_exit() so that if we get
+		 * preemption after calling guest_exit() so that if we get
 		 * preempted we make sure ticks after that is not counted as
 		 * guest time.
 		 */
-		kvm_guest_exit();
+		guest_exit();
 		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
 		/*
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 5a2b9034a05ce1..5f1163653b5062 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -406,7 +406,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	kvm_mips_deliver_interrupts(vcpu,
 				    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-	__kvm_guest_enter();
+	guest_enter_irqoff();
 
 	/* Disable hardware page table walking while in guest */
 	htw_stop();
@@ -418,7 +418,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	/* Re-enable HTW before enabling interrupts */
 	htw_start();
 
-	__kvm_guest_exit();
+	guest_exit_irqoff();
 	local_irq_enable();
 
 	if (vcpu->sigset_active)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e20beae5ca7a46..6b2859c12ae879 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2522,7 +2522,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
 			spin_unlock(&pvc->lock);
 
-	kvm_guest_enter();
+	guest_enter();
 
 	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
@@ -2570,7 +2570,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
-	kvm_guest_exit();
+	guest_exit();
 
 	for (sub = 0; sub < core_info.n_subcores; ++sub)
 		list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 8e4f64f0b7741d..6a66c5ff0827ef 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* We get here with MSR.EE=1 */
 
 	trace_kvm_exit(exit_nr, vcpu);
-	kvm_guest_exit();
+	guest_exit();
 
 	switch (exit_nr) {
 	case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1531,7 +1531,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	kvmppc_clear_debug(vcpu);
 
-	/* No need for kvm_guest_exit. It's done in handle_exit.
+	/* No need for guest_exit. It's done in handle_exit.
 	   We also get here with interrupts enabled. */
 
 	/* Make sure we save the guest FPU/Altivec/VSX state */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4afae695899ad9..02b4672f7347ec 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
-	/* No need for kvm_guest_exit. It's done in handle_exit.
+	/* No need for guest_exit. It's done in handle_exit.
 	   We also get here with interrupts enabled. */
 
 	/* Switch back to user space debug context */
@@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	}
 
 	trace_kvm_exit(exit_nr, vcpu);
-	__kvm_guest_exit();
+	guest_exit_irqoff();
 
 	local_irq_enable();
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 02416fea765301..1ac036e45ed4f8 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			continue;
 		}
 
-		__kvm_guest_enter();
+		guest_enter_irqoff();
 		return 1;
 	}
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 03eeeb0ded2470..d42428c1179412 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2623,14 +2623,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		 * guest_enter and guest_exit should be no uaccess.
 		 */
 		local_irq_disable();
-		__kvm_guest_enter();
+		guest_enter_irqoff();
 		__disable_cpu_timer_accounting(vcpu);
 		local_irq_enable();
 		exit_reason = sie64a(vcpu->arch.sie_block,
 				     vcpu->run->s.regs.gprs);
 		local_irq_disable();
 		__enable_cpu_timer_accounting(vcpu);
-		__kvm_guest_exit();
+		guest_exit_irqoff();
 		local_irq_enable();
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 6895e7b3be1237..c106488b41371b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -765,13 +765,13 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 	local_irq_disable();
-	kvm_guest_enter();
+	guest_enter_irqoff();
 	local_irq_enable();
 
 	rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
 
 	local_irq_disable();
-	kvm_guest_exit();
+	guest_exit_irqoff();
 	local_irq_enable();
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9e50e2ad6d0829..618463abeec5b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6658,7 +6658,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	trace_kvm_entry(vcpu->vcpu_id);
 	wait_lapic_expire(vcpu);
-	__kvm_guest_enter();
+	guest_enter_irqoff();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {
 		set_debugreg(0, 7);
@@ -6717,7 +6717,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 */
 	barrier();
 
-	kvm_guest_exit();
+	guest_exit();
 
 	preempt_enable();
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ffff405226887b..66b2f6159aadd3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -875,28 +875,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-/* must be called with irqs disabled */
-static inline void __kvm_guest_enter(void)
-{
-	guest_enter_irqoff();
-}
-
-/* must be called with irqs disabled */
-static inline void __kvm_guest_exit(void)
-{
-	guest_exit_irqoff();
-}
-
-static inline void kvm_guest_enter(void)
-{
-	guest_enter();
-}
-
-static inline void kvm_guest_exit(void)
-{
-	guest_exit();
-}
-
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.

From 91fa0f8e9e2937fd9360f326ad60d51908347afd Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 15 Jun 2016 20:55:08 +0200
Subject: [PATCH 179/302] KVM: x86: always use "acknowledge interrupt on exit"

This is necessary to simplify handle_external_intr in the next patch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 943609f06c90c0..1b413a520ef3c3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3390,12 +3390,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		      vmx_capability.ept, vmx_capability.vpid);
 	}
 
-	min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
 	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
-		VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+		VM_EXIT_CLEAR_BNDCFGS;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -3408,8 +3408,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		return -EIO;
 
 	if (!(_cpu_based_2nd_exec_control &
-		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
-		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
 		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 
 	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;

From f2485b3e0c6c0aa3a9546babc2fad3739e964ebb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 15 Jun 2016 15:23:11 +0200
Subject: [PATCH 180/302] KVM: x86: use guest_exit_irqoff

This gains a few clock cycles per vmexit.  On Intel there is no need
anymore to enable the interrupts in vmx_handle_external_intr, since
we are using the "acknowledge interrupt on exit" feature.  AMD
needs to do that, and must be careful to avoid the interrupt shadow.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c |  6 ++++++
 arch/x86/kvm/vmx.c |  4 +---
 arch/x86/kvm/x86.c | 11 ++---------
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5ff2927781100f..5bfdbbf1ce793d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4935,6 +4935,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
 {
 	local_irq_enable();
+	/*
+	 * We must have an instruction with interrupts enabled, so
+	 * the timer interrupt isn't delayed by the interrupt shadow.
+	 */
+	asm("nop");
+	local_irq_disable();
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1b413a520ef3c3..c1d655c10fd245 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8574,7 +8574,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			"push %[sp]\n\t"
 #endif
 			"pushf\n\t"
-			"orl $0x200, (%%" _ASM_SP ")\n\t"
 			__ASM_SIZE(push) " $%c[cs]\n\t"
 			"call *%[entry]\n\t"
 			:
@@ -8587,8 +8586,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			[ss]"i"(__KERNEL_DS),
 			[cs]"i"(__KERNEL_CS)
 			);
-	} else
-		local_irq_enable();
+	}
 }
 
 static bool vmx_has_high_real_mode_segbase(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 618463abeec5b2..0cc6cf834cdd7f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6709,16 +6709,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	++vcpu->stat.exits;
 
-	/*
-	 * We must have an instruction between local_irq_enable() and
-	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
-	 * the interrupt shadow.  The stat.exits increment will do nicely.
-	 * But we need to prevent reordering, hence this barrier():
-	 */
-	barrier();
-
-	guest_exit();
+	guest_exit_irqoff();
 
+	local_irq_enable();
 	preempt_enable();
 
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

From 9175d2e97b08e86293e68246020a5c29f88aa674 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 27 Jun 2016 15:08:01 +0200
Subject: [PATCH 181/302] KVM: vmx: fix underflow in TSC deadline calculation

If the TSC deadline timer is programmed really close to the deadline or
even in the past, the computation in vmx_set_hv_timer can underflow and
cause delta_tsc to be set to a huge value.  This generally results
in vmx_set_hv_timer returning -ERANGE, but we can fix it by limiting
delta_tsc to be positive or zero.

Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c1d655c10fd245..85e2f0a882ca99 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10829,9 +10829,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tscl = rdtsc(), delta_tsc;
-
-	delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl);
+	u64 tscl = rdtsc();
+	u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+	u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
 	/* Convert to host delta tsc if tsc scaling is enabled */
 	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&

From bd97ad0e7ed6a8870cc691fdfd108dc952fe45eb Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 30 Jun 2016 08:52:49 +0800
Subject: [PATCH 182/302] KVM: x86: introduce cancel_hv_tscdeadline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce cancel_hv_tscdeadline() to encapsulate the preemption timer
cancellation logic.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fdc05ae08bac94..9c20ac14caf59e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1349,14 +1349,19 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+}
+
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
 	WARN_ON(swait_active(&vcpu->wq));
-	kvm_x86_ops->cancel_hv_timer(vcpu);
-	apic->lapic_timer.hv_timer_in_use = false;
+	cancel_hv_tscdeadline(apic);
 	apic_timer_expired(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
@@ -1376,10 +1381,8 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 			hrtimer_cancel(&apic->lapic_timer.timer);
 
 			/* In case the sw timer triggered in the window */
-			if (atomic_read(&apic->lapic_timer.pending)) {
-				apic->lapic_timer.hv_timer_in_use = false;
-				kvm_x86_ops->cancel_hv_timer(apic->vcpu);
-			}
+			if (atomic_read(&apic->lapic_timer.pending))
+				cancel_hv_tscdeadline(apic);
 		}
 		trace_kvm_hv_timer_state(vcpu->vcpu_id,
 				apic->lapic_timer.hv_timer_in_use);
@@ -1395,8 +1398,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 	if (!apic->lapic_timer.hv_timer_in_use)
 		return;
 
-	kvm_x86_ops->cancel_hv_timer(vcpu);
-	apic->lapic_timer.hv_timer_in_use = false;
+	cancel_hv_tscdeadline(apic);
 
 	if (atomic_read(&apic->lapic_timer.pending))
 		return;

From 196f20ca52e8c7281932663c348fa54b82d03914 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <kernellwp@gmail.com>
Date: Tue, 28 Jun 2016 14:54:19 +0800
Subject: [PATCH 183/302] KVM: vmx: fix missed cancellation of TSC deadline
 timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

INFO: rcu_sched detected stalls on CPUs/tasks:
 1-...: (11800 GPs behind) idle=45d/140000000000000/0 softirq=0/0 fqs=21663
 (detected by 0, t=65016 jiffies, g=11500, c=11499, q=719)
Task dump for CPU 1:
qemu-system-x86 R  running task        0  3529   3525 0x00080808
 ffff8802021791a0 ffff880212895040 0000000000000001 00007f1c2c00db40
 ffff8801dd20fcd3 ffffc90002b98000 ffff8801dd20fc88 ffff8801dd20fcf8
 0000000000000286 ffff8801dd2ac538 ffff8801dd20fcc0 ffffffffc06949c9
Call Trace:
? kvm_write_guest_cached+0xb9/0x160 [kvm]
? __delay+0xf/0x20
? wait_lapic_expire+0x14a/0x200 [kvm]
? kvm_arch_vcpu_ioctl_run+0xcbe/0x1b00 [kvm]
? kvm_arch_vcpu_ioctl_run+0xe34/0x1b00 [kvm]
? kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm]
? __fget+0x5/0x210
? do_vfs_ioctl+0x96/0x6a0
? __fget_light+0x2a/0x90
? SyS_ioctl+0x79/0x90
? do_syscall_64+0x7c/0x1e0
? entry_SYSCALL64_slow_path+0x25/0x25

This can be reproduced readily by running a full dynticks guest (since
hrtimers are heavily used in the guest) with lapic_timer_advance disabled.

If we fail to program the hardware preemption timer, we fall back to the
hrtimer-based method. However, a previously programmed preemption timer is
not cancelled in this scenario, which results in one hardware preemption
timer and one hrtimer-emulated TSC deadline timer running simultaneously.
So sometimes the target guest deadline TSC is earlier than the guest TSC,
which causes the computation in vmx_set_hv_timer to underflow and set
delta_tsc to a huge value; the host then hits a soft lockup as above.

This patch fixes it by cancelling the previously programmed preemption
timer, if there is one, once we fail to program the new preemption timer
and fall back to the hrtimer-based method.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 48 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9c20ac14caf59e..22a6474af220e9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1366,27 +1366,35 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
 
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+	u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+	if (atomic_read(&apic->lapic_timer.pending) ||
+		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+		if (apic->lapic_timer.hv_timer_in_use)
+			cancel_hv_tscdeadline(apic);
+	} else {
+		apic->lapic_timer.hv_timer_in_use = true;
+		hrtimer_cancel(&apic->lapic_timer.timer);
+
+		/* In case the sw timer triggered in the window */
+		if (atomic_read(&apic->lapic_timer.pending))
+			cancel_hv_tscdeadline(apic);
+	}
+	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+			apic->lapic_timer.hv_timer_in_use);
+	return apic->lapic_timer.hv_timer_in_use;
+}
+
 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	WARN_ON(apic->lapic_timer.hv_timer_in_use);
 
-	if (apic_lvtt_tscdeadline(apic) &&
-	    !atomic_read(&apic->lapic_timer.pending)) {
-		u64 tscdeadline = apic->lapic_timer.tscdeadline;
-
-		if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) {
-			apic->lapic_timer.hv_timer_in_use = true;
-			hrtimer_cancel(&apic->lapic_timer.timer);
-
-			/* In case the sw timer triggered in the window */
-			if (atomic_read(&apic->lapic_timer.pending))
-				cancel_hv_tscdeadline(apic);
-		}
-		trace_kvm_hv_timer_state(vcpu->vcpu_id,
-				apic->lapic_timer.hv_timer_in_use);
-	}
+	if (apic_lvtt_tscdeadline(apic))
+		start_hv_tscdeadline(apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
@@ -1453,15 +1461,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		/* lapic timer in tsc deadline mode */
-		u64 tscdeadline = apic->lapic_timer.tscdeadline;
-
-		if (kvm_x86_ops->set_hv_timer &&
-		    !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
-			apic->lapic_timer.hv_timer_in_use = true;
-			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
-					apic->lapic_timer.hv_timer_in_use);
-		} else
+		if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
 			start_sw_tscdeadline(apic);
 	}
 }

From 50926d82fa271fa76d5717b546a66f7b5703ff05 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sat, 28 May 2016 11:27:11 +0100
Subject: [PATCH 184/302] KVM: arm/arm64: The GIC is dead, long live the GIC

I don't think any single piece of the KVM/ARM code ever generated
as much hatred as the GIC emulation.

It was written by someone who had zero experience in modeling
hardware (me), was riddled with design flaws, should have been
scrapped and rewritten from scratch long before having a remote
chance of reaching mainline, and yet we supported it for a good
three years. No need to mention the names of those who suffered,
the git log is singing their praises.

Thankfully, we now have a much more maintainable implementation,
and we can safely put the grumpy old GIC to rest.

Fellow hackers, please raise your glass in memory of the GIC:

	The GIC is dead, long live the GIC!

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/kvm/Kconfig          |    7 -
 arch/arm/kvm/Makefile         |    6 -
 arch/arm64/kvm/Kconfig        |    7 -
 arch/arm64/kvm/Makefile       |    8 -
 include/kvm/arm_vgic.h        |  381 ++---
 include/kvm/vgic/vgic.h       |  246 ----
 virt/kvm/arm/hyp/vgic-v2-sr.c |   15 +-
 virt/kvm/arm/vgic-v2-emul.c   |  856 ------------
 virt/kvm/arm/vgic-v2.c        |  274 ----
 virt/kvm/arm/vgic-v3-emul.c   | 1074 ---------------
 virt/kvm/arm/vgic-v3.c        |  279 ----
 virt/kvm/arm/vgic.c           | 2440 ---------------------------------
 virt/kvm/arm/vgic.h           |  140 --
 13 files changed, 130 insertions(+), 5603 deletions(-)
 delete mode 100644 include/kvm/vgic/vgic.h
 delete mode 100644 virt/kvm/arm/vgic-v2-emul.c
 delete mode 100644 virt/kvm/arm/vgic-v2.c
 delete mode 100644 virt/kvm/arm/vgic-v3-emul.c
 delete mode 100644 virt/kvm/arm/vgic-v3.c
 delete mode 100644 virt/kvm/arm/vgic.c
 delete mode 100644 virt/kvm/arm/vgic.h

diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 02abfff68ee542..95a000515e4328 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -46,13 +46,6 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 
-config KVM_NEW_VGIC
-	bool "New VGIC implementation"
-	depends on KVM
-	default y
-	---help---
-	  uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index a596b58f6d37d0..5e28df80dca7a2 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 obj-y += $(KVM)/arm/vgic/vgic.o
 obj-y += $(KVM)/arm/vgic/vgic-init.o
 obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio.o
 obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
 obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-obj-y += $(KVM)/arm/vgic.o
-obj-y += $(KVM)/arm/vgic-v2.o
-obj-y += $(KVM)/arm/vgic-v2-emul.o
-endif
 obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index c4f26ef91e772f..aa2e34e99582df 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -54,13 +54,6 @@ config KVM_ARM_PMU
 	  Adds support for a virtual Performance Monitoring Unit (PMU) in
 	  virtual machines.
 
-config KVM_NEW_VGIC
-	bool "New VGIC implementation"
-	depends on KVM
-	default y
-        ---help---
-          uses the new VGIC implementation
-
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index a7a958ca29d56a..f00b2cdd0d337d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,12 +29,5 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-endif
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index da0a524802cbf4..12640378db9899 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -1,6 +1,5 @@
 /*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
+ * Copyright (C) 2015, 2016 ARM Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -12,16 +11,10 @@
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#ifdef CONFIG_KVM_NEW_VGIC
-#include <kvm/vgic/vgic.h>
-#else
+#ifndef __KVM_ARM_VGIC_H
+#define __KVM_ARM_VGIC_H
 
 #include <linux/kernel.h>
 #include <linux/kvm.h>
@@ -29,248 +22,130 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
 
-#define VGIC_NR_IRQS_LEGACY	256
+#define VGIC_V3_MAX_CPUS	255
+#define VGIC_V2_MAX_CPUS	8
+#define VGIC_NR_IRQS_LEGACY     256
 #define VGIC_NR_SGIS		16
 #define VGIC_NR_PPIS		16
 #define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE	(VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI		1019
+#define VGIC_MAX_RESERVED	1023
+#define VGIC_MIN_LPI		8192
 
-#define VGIC_V2_MAX_LRS		(1 << 6)
-#define VGIC_V3_MAX_LRS		16
-#define VGIC_MAX_IRQS		1024
-#define VGIC_V2_MAX_CPUS	8
-#define VGIC_V3_MAX_CPUS	255
-
-#if (VGIC_NR_IRQS_LEGACY & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
+enum vgic_type {
+	VGIC_V2,		/* Good ol' GICv2 */
+	VGIC_V3,		/* New fancy GICv3 */
+};
 
-#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
+/* same for all guests, as depending only on the _host's_ GIC model */
+struct vgic_global {
+	/* type of the host GIC */
+	enum vgic_type		type;
 
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
-	/*
-	 * - One UL per VCPU for private interrupts (assumes UL is at
-	 *   least 32 bits)
-	 * - As many UL as necessary for shared interrupts.
-	 *
-	 * The private interrupts are accessed via the "private"
-	 * field, one UL per vcpu (the state for vcpu n is in
-	 * private[n]). The shared interrupts are accessed via the
-	 * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
-	 */
-	unsigned long *private;
-	unsigned long *shared;
-};
+	/* Physical address of vgic virtual cpu interface */
+	phys_addr_t		vcpu_base;
 
-struct vgic_bytemap {
-	/*
-	 * - 8 u32 per VCPU for private interrupts
-	 * - As many u32 as necessary for shared interrupts.
-	 *
-	 * The private interrupts are accessed via the "private"
-	 * field, (the state for vcpu n is in private[n*8] to
-	 * private[n*8 + 7]). The shared interrupts are accessed via
-	 * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
-	 * shared[(n-32)/4] word).
-	 */
-	u32 *private;
-	u32 *shared;
-};
+	/* virtual control interface mapping */
+	void __iomem		*vctrl_base;
 
-struct kvm_vcpu;
+	/* Number of implemented list registers */
+	int			nr_lr;
 
-enum vgic_type {
-	VGIC_V2,		/* Good ol' GICv2 */
-	VGIC_V3,		/* New fancy GICv3 */
-};
+	/* Maintenance IRQ number */
+	unsigned int		maint_irq;
 
-#define LR_STATE_PENDING	(1 << 0)
-#define LR_STATE_ACTIVE		(1 << 1)
-#define LR_STATE_MASK		(3 << 0)
-#define LR_EOI_INT		(1 << 2)
-#define LR_HW			(1 << 3)
+	/* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+	int			max_gic_vcpus;
 
-struct vgic_lr {
-	unsigned irq:10;
-	union {
-		unsigned hwirq:10;
-		unsigned source:3;
-	};
-	unsigned state:4;
+	/* Only needed for the legacy KVM_CREATE_IRQCHIP */
+	bool			can_emulate_gicv2;
 };
 
-struct vgic_vmcr {
-	u32	ctlr;
-	u32	abpr;
-	u32	bpr;
-	u32	pmr;
-};
+extern struct vgic_global kvm_vgic_global_state;
 
-struct vgic_ops {
-	struct vgic_lr	(*get_lr)(const struct kvm_vcpu *, int);
-	void	(*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
-	u64	(*get_elrsr)(const struct kvm_vcpu *vcpu);
-	u64	(*get_eisr)(const struct kvm_vcpu *vcpu);
-	void	(*clear_eisr)(struct kvm_vcpu *vcpu);
-	u32	(*get_interrupt_status)(const struct kvm_vcpu *vcpu);
-	void	(*enable_underflow)(struct kvm_vcpu *vcpu);
-	void	(*disable_underflow)(struct kvm_vcpu *vcpu);
-	void	(*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-	void	(*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-	void	(*enable)(struct kvm_vcpu *vcpu);
-};
+#define VGIC_V2_MAX_LRS		(1 << 6)
+#define VGIC_V3_MAX_LRS		16
+#define VGIC_V3_LR_INDEX(lr)	(VGIC_V3_MAX_LRS - 1 - lr)
 
-struct vgic_params {
-	/* vgic type */
-	enum vgic_type	type;
-	/* Physical address of vgic virtual cpu interface */
-	phys_addr_t	vcpu_base;
-	/* Number of list registers */
-	u32		nr_lr;
-	/* Interrupt number */
-	unsigned int	maint_irq;
-	/* Virtual control interface base address */
-	void __iomem	*vctrl_base;
-	int		max_gic_vcpus;
-	/* Only needed for the legacy KVM_CREATE_IRQCHIP */
-	bool		can_emulate_gicv2;
+enum vgic_irq_config {
+	VGIC_CONFIG_EDGE = 0,
+	VGIC_CONFIG_LEVEL
 };
 
-struct vgic_vm_ops {
-	bool	(*queue_sgi)(struct kvm_vcpu *, int irq);
-	void	(*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
-	int	(*init_model)(struct kvm *);
-	int	(*map_resources)(struct kvm *, const struct vgic_params *);
+struct vgic_irq {
+	spinlock_t irq_lock;		/* Protects the content of the struct */
+	struct list_head ap_list;
+
+	struct kvm_vcpu *vcpu;		/* SGIs and PPIs: The VCPU
+					 * SPIs and LPIs: The VCPU whose ap_list
+					 * this is queued on.
+					 */
+
+	struct kvm_vcpu *target_vcpu;	/* The VCPU that this interrupt should
+					 * be sent to, as a result of the
+					 * targets reg (v2) or the
+					 * affinity reg (v3).
+					 */
+
+	u32 intid;			/* Guest visible INTID */
+	bool pending;
+	bool line_level;		/* Level only */
+	bool soft_pending;		/* Level only */
+	bool active;			/* not used for LPIs */
+	bool enabled;
+	bool hw;			/* Tied to HW IRQ */
+	u32 hwintid;			/* HW INTID number */
+	union {
+		u8 targets;			/* GICv2 target VCPUs mask */
+		u32 mpidr;			/* GICv3 target VCPU */
+	};
+	u8 source;			/* GICv2 SGIs only */
+	u8 priority;
+	enum vgic_irq_config config;	/* Level or edge */
 };
 
+struct vgic_register_region;
+
 struct vgic_io_device {
-	gpa_t addr;
-	int len;
-	const struct vgic_io_range *reg_ranges;
+	gpa_t base_addr;
 	struct kvm_vcpu *redist_vcpu;
+	const struct vgic_register_region *regions;
+	int nr_regions;
 	struct kvm_io_device dev;
 };
 
-struct irq_phys_map {
-	u32			virt_irq;
-	u32			phys_irq;
-};
-
-struct irq_phys_map_entry {
-	struct list_head	entry;
-	struct rcu_head		rcu;
-	struct irq_phys_map	map;
-};
-
 struct vgic_dist {
-	spinlock_t		lock;
 	bool			in_kernel;
 	bool			ready;
+	bool			initialized;
 
 	/* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
 	u32			vgic_model;
 
-	int			nr_cpus;
-	int			nr_irqs;
+	int			nr_spis;
 
+	/* TODO: Consider moving to global state */
 	/* Virtual control interface mapping */
 	void __iomem		*vctrl_base;
 
-	/* Distributor and vcpu interface mapping in the guest */
-	phys_addr_t		vgic_dist_base;
-	/* GICv2 and GICv3 use different mapped register blocks */
+	/* base addresses in guest physical address space: */
+	gpa_t			vgic_dist_base;		/* distributor */
 	union {
-		phys_addr_t		vgic_cpu_base;
-		phys_addr_t		vgic_redist_base;
+		/* either a GICv2 CPU interface */
+		gpa_t			vgic_cpu_base;
+		/* or a number of GICv3 redistributor regions */
+		gpa_t			vgic_redist_base;
 	};
 
-	/* Distributor enabled */
-	u32			enabled;
-
-	/* Interrupt enabled (one bit per IRQ) */
-	struct vgic_bitmap	irq_enabled;
-
-	/* Level-triggered interrupt external input is asserted */
-	struct vgic_bitmap	irq_level;
-
-	/*
-	 * Interrupt state is pending on the distributor
-	 */
-	struct vgic_bitmap	irq_pending;
-
-	/*
-	 * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered
-	 * interrupts.  Essentially holds the state of the flip-flop in
-	 * Figure 4-10 on page 4-101 in ARM IHI 0048B.b.
-	 * Once set, it is only cleared for level-triggered interrupts on
-	 * guest ACKs (when we queue it) or writes to GICD_ICPENDRn.
-	 */
-	struct vgic_bitmap	irq_soft_pend;
-
-	/* Level-triggered interrupt queued on VCPU interface */
-	struct vgic_bitmap	irq_queued;
-
-	/* Interrupt was active when unqueue from VCPU interface */
-	struct vgic_bitmap	irq_active;
+	/* distributor enabled */
+	bool			enabled;
 
-	/* Interrupt priority. Not used yet. */
-	struct vgic_bytemap	irq_priority;
+	struct vgic_irq		*spis;
 
-	/* Level/edge triggered */
-	struct vgic_bitmap	irq_cfg;
-
-	/*
-	 * Source CPU per SGI and target CPU:
-	 *
-	 * Each byte represent a SGI observable on a VCPU, each bit of
-	 * this byte indicating if the corresponding VCPU has
-	 * generated this interrupt. This is a GICv2 feature only.
-	 *
-	 * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
-	 * the SGIs observable on VCPUn.
-	 */
-	u8			*irq_sgi_sources;
-
-	/*
-	 * Target CPU for each SPI:
-	 *
-	 * Array of available SPI, each byte indicating the target
-	 * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
-	 */
-	u8			*irq_spi_cpu;
-
-	/*
-	 * Reverse lookup of irq_spi_cpu for faster compute pending:
-	 *
-	 * Array of bitmaps, one per VCPU, describing if IRQn is
-	 * routed to a particular VCPU.
-	 */
-	struct vgic_bitmap	*irq_spi_target;
-
-	/* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
-	u32			*irq_spi_mpidr;
-
-	/* Bitmap indicating which CPU has something pending */
-	unsigned long		*irq_pending_on_cpu;
-
-	/* Bitmap indicating which CPU has active IRQs */
-	unsigned long		*irq_active_on_cpu;
-
-	struct vgic_vm_ops	vm_ops;
 	struct vgic_io_device	dist_iodev;
 	struct vgic_io_device	*redist_iodevs;
-
-	/* Virtual irq to hwirq mapping */
-	spinlock_t		irq_phys_map_lock;
-	struct list_head	irq_phys_map_list;
 };
 
 struct vgic_v2_cpu_if {
@@ -298,78 +173,74 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-	/* Pending/active/both interrupts on this VCPU */
-	DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
-	DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
-	DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
-
-	/* Pending/active/both shared interrupts, dynamically sized */
-	unsigned long	*pending_shared;
-	unsigned long   *active_shared;
-	unsigned long   *pend_act_shared;
-
 	/* CPU vif control registers for world switch */
 	union {
 		struct vgic_v2_cpu_if	vgic_v2;
 		struct vgic_v3_cpu_if	vgic_v3;
 	};
 
-	/* Protected by the distributor's irq_phys_map_lock */
-	struct list_head	irq_phys_map_list;
-
-	u64		live_lrs;
-};
+	unsigned int used_lrs;
+	struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
 
-#define LR_EMPTY	0xff
+	spinlock_t ap_list_lock;	/* Protects the ap_list */
 
-#define INT_STATUS_EOI		(1 << 0)
-#define INT_STATUS_UNDERFLOW	(1 << 1)
+	/*
+	 * List of IRQs that this VCPU should consider because they are either
+	 * Active or Pending (hence the name; AP list), or because they recently
+	 * were one of the two and need to be migrated off this list to another
+	 * VCPU.
+	 */
+	struct list_head ap_list_head;
 
-struct kvm;
-struct kvm_vcpu;
+	u64 live_lrs;
+};
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_get_max_vcpus(void);
 void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 			bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-			       unsigned int virt_irq, bool level);
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+			       bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
+#define vgic_initialized(k)	((k)->arch.vgic.initialized)
 #define vgic_ready(k)		((k)->arch.vgic.ready)
 #define vgic_valid_spi(k, i)	(((i) >= VGIC_NR_PRIVATE_IRQS) && \
-				 ((i) < (k)->arch.vgic.nr_irqs))
+			((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-		  const struct vgic_ops **ops,
-		  const struct vgic_params **params);
 #ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-		  const struct vgic_ops **ops,
-		  const struct vgic_params **params);
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 #else
-static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-				const struct vgic_ops **ops,
-				const struct vgic_params **params)
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
 {
-	return -ENODEV;
 }
 #endif
 
-#endif	/* old VGIC include */
-#endif
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum amount of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+	return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+#endif /* __KVM_ARM_VGIC_H */
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
deleted file mode 100644
index 3fbd175265ae4f..00000000000000
--- a/include/kvm/vgic/vgic.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
-#define __ASM_ARM_KVM_VGIC_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <kvm/iodev.h>
-
-#define VGIC_V3_MAX_CPUS	255
-#define VGIC_V2_MAX_CPUS	8
-#define VGIC_NR_IRQS_LEGACY     256
-#define VGIC_NR_SGIS		16
-#define VGIC_NR_PPIS		16
-#define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_PRIVATE	(VGIC_NR_PRIVATE_IRQS - 1)
-#define VGIC_MAX_SPI		1019
-#define VGIC_MAX_RESERVED	1023
-#define VGIC_MIN_LPI		8192
-
-enum vgic_type {
-	VGIC_V2,		/* Good ol' GICv2 */
-	VGIC_V3,		/* New fancy GICv3 */
-};
-
-/* same for all guests, as depending only on the _host's_ GIC model */
-struct vgic_global {
-	/* type of the host GIC */
-	enum vgic_type		type;
-
-	/* Physical address of vgic virtual cpu interface */
-	phys_addr_t		vcpu_base;
-
-	/* virtual control interface mapping */
-	void __iomem		*vctrl_base;
-
-	/* Number of implemented list registers */
-	int			nr_lr;
-
-	/* Maintenance IRQ number */
-	unsigned int		maint_irq;
-
-	/* maximum number of VCPUs allowed (GICv2 limits us to 8) */
-	int			max_gic_vcpus;
-
-	/* Only needed for the legacy KVM_CREATE_IRQCHIP */
-	bool			can_emulate_gicv2;
-};
-
-extern struct vgic_global kvm_vgic_global_state;
-
-#define VGIC_V2_MAX_LRS		(1 << 6)
-#define VGIC_V3_MAX_LRS		16
-#define VGIC_V3_LR_INDEX(lr)	(VGIC_V3_MAX_LRS - 1 - lr)
-
-enum vgic_irq_config {
-	VGIC_CONFIG_EDGE = 0,
-	VGIC_CONFIG_LEVEL
-};
-
-struct vgic_irq {
-	spinlock_t irq_lock;		/* Protects the content of the struct */
-	struct list_head ap_list;
-
-	struct kvm_vcpu *vcpu;		/* SGIs and PPIs: The VCPU
-					 * SPIs and LPIs: The VCPU whose ap_list
-					 * this is queued on.
-					 */
-
-	struct kvm_vcpu *target_vcpu;	/* The VCPU that this interrupt should
-					 * be sent to, as a result of the
-					 * targets reg (v2) or the
-					 * affinity reg (v3).
-					 */
-
-	u32 intid;			/* Guest visible INTID */
-	bool pending;
-	bool line_level;		/* Level only */
-	bool soft_pending;		/* Level only */
-	bool active;			/* not used for LPIs */
-	bool enabled;
-	bool hw;			/* Tied to HW IRQ */
-	u32 hwintid;			/* HW INTID number */
-	union {
-		u8 targets;			/* GICv2 target VCPUs mask */
-		u32 mpidr;			/* GICv3 target VCPU */
-	};
-	u8 source;			/* GICv2 SGIs only */
-	u8 priority;
-	enum vgic_irq_config config;	/* Level or edge */
-};
-
-struct vgic_register_region;
-
-struct vgic_io_device {
-	gpa_t base_addr;
-	struct kvm_vcpu *redist_vcpu;
-	const struct vgic_register_region *regions;
-	int nr_regions;
-	struct kvm_io_device dev;
-};
-
-struct vgic_dist {
-	bool			in_kernel;
-	bool			ready;
-	bool			initialized;
-
-	/* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
-	u32			vgic_model;
-
-	int			nr_spis;
-
-	/* TODO: Consider moving to global state */
-	/* Virtual control interface mapping */
-	void __iomem		*vctrl_base;
-
-	/* base addresses in guest physical address space: */
-	gpa_t			vgic_dist_base;		/* distributor */
-	union {
-		/* either a GICv2 CPU interface */
-		gpa_t			vgic_cpu_base;
-		/* or a number of GICv3 redistributor regions */
-		gpa_t			vgic_redist_base;
-	};
-
-	/* distributor enabled */
-	bool			enabled;
-
-	struct vgic_irq		*spis;
-
-	struct vgic_io_device	dist_iodev;
-	struct vgic_io_device	*redist_iodevs;
-};
-
-struct vgic_v2_cpu_if {
-	u32		vgic_hcr;
-	u32		vgic_vmcr;
-	u32		vgic_misr;	/* Saved only */
-	u64		vgic_eisr;	/* Saved only */
-	u64		vgic_elrsr;	/* Saved only */
-	u32		vgic_apr;
-	u32		vgic_lr[VGIC_V2_MAX_LRS];
-};
-
-struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-	u32		vgic_hcr;
-	u32		vgic_vmcr;
-	u32		vgic_sre;	/* Restored only, change ignored */
-	u32		vgic_misr;	/* Saved only */
-	u32		vgic_eisr;	/* Saved only */
-	u32		vgic_elrsr;	/* Saved only */
-	u32		vgic_ap0r[4];
-	u32		vgic_ap1r[4];
-	u64		vgic_lr[VGIC_V3_MAX_LRS];
-#endif
-};
-
-struct vgic_cpu {
-	/* CPU vif control registers for world switch */
-	union {
-		struct vgic_v2_cpu_if	vgic_v2;
-		struct vgic_v3_cpu_if	vgic_v3;
-	};
-
-	unsigned int used_lrs;
-	struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
-
-	spinlock_t ap_list_lock;	/* Protects the ap_list */
-
-	/*
-	 * List of IRQs that this VCPU should consider because they are either
-	 * Active or Pending (hence the name; AP list), or because they recently
-	 * were one of the two and need to be migrated off this list to another
-	 * VCPU.
-	 */
-	struct list_head ap_list_head;
-
-	u64 live_lrs;
-};
-
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-void kvm_vgic_early_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm, u32 type);
-void kvm_vgic_destroy(struct kvm *kvm);
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_hyp_init(void);
-
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			       bool level);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-
-#define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)	((k)->arch.vgic.initialized)
-#define vgic_ready(k)		((k)->arch.vgic.ready)
-#define vgic_valid_spi(k, i)	(((i) >= VGIC_NR_PRIVATE_IRQS) && \
-			((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
-
-bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-static inline int kvm_vgic_get_max_vcpus(void)
-{
-	return kvm_vgic_global_state.max_gic_vcpus;
-}
-
-#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index 3a3a699b748950..7cffd9338c494b 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -21,18 +21,11 @@
 
 #include <asm/kvm_hyp.h>
 
-#ifdef CONFIG_KVM_NEW_VGIC
-extern struct vgic_global kvm_vgic_global_state;
-#define vgic_v2_params kvm_vgic_global_state
-#else
-extern struct vgic_params vgic_v2_params;
-#endif
-
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 					    void __iomem *base)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+	int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
 	u32 eisr0, eisr1;
 	int i;
 	bool expect_mi;
@@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+	int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
 	u32 elrsr0, elrsr1;
 
 	elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+	int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
 	int i;
 
 	for (i = 0; i < nr_lr; i++) {
@@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
 	struct vgic_dist *vgic = &kvm->arch.vgic;
 	void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-	int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+	int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
 	int i;
 	u64 live_lrs = 0;
 
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
deleted file mode 100644
index 1b0bee095427ec..00000000000000
--- a/virt/kvm/arm/vgic-v2-emul.c
+++ /dev/null
@@ -1,856 +0,0 @@
-/*
- * Contains GICv2 specific emulation code, was in vgic.c before.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-#define GICC_ARCH_VERSION_V2		0x2
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
-static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
-{
-	return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
-}
-
-static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
-			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg;
-	u32 word_offset = offset & 3;
-
-	switch (offset & ~3) {
-	case 0:			/* GICD_CTLR */
-		reg = vcpu->kvm->arch.vgic.enabled;
-		vgic_reg_access(mmio, &reg, word_offset,
-				ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-		if (mmio->is_write) {
-			vcpu->kvm->arch.vgic.enabled = reg & 1;
-			vgic_update_state(vcpu->kvm);
-			return true;
-		}
-		break;
-
-	case 4:			/* GICD_TYPER */
-		reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-		reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
-		vgic_reg_access(mmio, &reg, word_offset,
-				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-		break;
-
-	case 8:			/* GICD_IIDR */
-		reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-		vgic_reg_access(mmio, &reg, word_offset,
-				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-		break;
-	}
-
-	return false;
-}
-
-static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
-				       struct kvm_exit_mmio *mmio,
-				       phys_addr_t offset)
-{
-	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-				      vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
-					 struct kvm_exit_mmio *mmio,
-					 phys_addr_t offset)
-{
-	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-				      vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
-					struct kvm_exit_mmio *mmio,
-					phys_addr_t offset)
-{
-	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-					   vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
-					  struct kvm_exit_mmio *mmio,
-					  phys_addr_t offset)
-{
-	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-					     vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
-				       struct kvm_exit_mmio *mmio,
-				       phys_addr_t offset)
-{
-	return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-					  vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
-					 struct kvm_exit_mmio *mmio,
-					 phys_addr_t offset)
-{
-	return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-					    vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
-				     struct kvm_exit_mmio *mmio,
-				     phys_addr_t offset)
-{
-	u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-					vcpu->vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	return false;
-}
-
-#define GICD_ITARGETSR_SIZE	32
-#define GICD_CPUTARGETS_BITS	8
-#define GICD_IRQS_PER_ITARGETSR	(GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
-static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int i;
-	u32 val = 0;
-
-	irq -= VGIC_NR_PRIVATE_IRQS;
-
-	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-		val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
-
-	return val;
-}
-
-static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, c;
-	unsigned long *bmap;
-	u32 target;
-
-	irq -= VGIC_NR_PRIVATE_IRQS;
-
-	/*
-	 * Pick the LSB in each byte. This ensures we target exactly
-	 * one vcpu per IRQ. If the byte is null, assume we target
-	 * CPU0.
-	 */
-	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
-		int shift = i * GICD_CPUTARGETS_BITS;
-
-		target = ffs((val >> shift) & 0xffU);
-		target = target ? (target - 1) : 0;
-		dist->irq_spi_cpu[irq + i] = target;
-		kvm_for_each_vcpu(c, vcpu, kvm) {
-			bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-			if (c == target)
-				set_bit(irq + i, bmap);
-			else
-				clear_bit(irq + i, bmap);
-		}
-	}
-}
-
-static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
-				   struct kvm_exit_mmio *mmio,
-				   phys_addr_t offset)
-{
-	u32 reg;
-
-	/* We treat the banked interrupts targets as read-only */
-	if (offset < 32) {
-		u32 roreg;
-
-		roreg = 1 << vcpu->vcpu_id;
-		roreg |= roreg << 8;
-		roreg |= roreg << 16;
-
-		vgic_reg_access(mmio, &roreg, offset,
-				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-		return false;
-	}
-
-	reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	if (mmio->is_write) {
-		vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
-		vgic_update_state(vcpu->kvm);
-		return true;
-	}
-
-	return false;
-}
-
-static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
-				struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 *reg;
-
-	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				  vcpu->vcpu_id, offset >> 1);
-
-	return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
-				struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg;
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
-	if (mmio->is_write) {
-		vgic_dispatch_sgi(vcpu, reg);
-		vgic_update_state(vcpu->kvm);
-		return true;
-	}
-
-	return false;
-}
-
-/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
-static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-					struct kvm_exit_mmio *mmio,
-					phys_addr_t offset)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int sgi;
-	int min_sgi = (offset & ~0x3);
-	int max_sgi = min_sgi + 3;
-	int vcpu_id = vcpu->vcpu_id;
-	u32 reg = 0;
-
-	/* Copy source SGIs from distributor side */
-	for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-		u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-		reg |= ((u32)sources) << (8 * (sgi - min_sgi));
-	}
-
-	mmio_data_write(mmio, ~0, reg);
-	return false;
-}
-
-static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
-					 struct kvm_exit_mmio *mmio,
-					 phys_addr_t offset, bool set)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int sgi;
-	int min_sgi = (offset & ~0x3);
-	int max_sgi = min_sgi + 3;
-	int vcpu_id = vcpu->vcpu_id;
-	u32 reg;
-	bool updated = false;
-
-	reg = mmio_data_read(mmio, ~0);
-
-	/* Clear pending SGIs on the distributor */
-	for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
-		u8 mask = reg >> (8 * (sgi - min_sgi));
-		u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
-		if (set) {
-			if ((*src & mask) != mask)
-				updated = true;
-			*src |= mask;
-		} else {
-			if (*src & mask)
-				updated = true;
-			*src &= ~mask;
-		}
-	}
-
-	if (updated)
-		vgic_update_state(vcpu->kvm);
-
-	return updated;
-}
-
-static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
-				struct kvm_exit_mmio *mmio,
-				phys_addr_t offset)
-{
-	if (!mmio->is_write)
-		return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-	else
-		return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
-}
-
-static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
-				  struct kvm_exit_mmio *mmio,
-				  phys_addr_t offset)
-{
-	if (!mmio->is_write)
-		return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
-	else
-		return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
-}
-
-static const struct vgic_io_range vgic_dist_ranges[] = {
-	{
-		.base		= GIC_DIST_SOFTINT,
-		.len		= 4,
-		.handle_mmio	= handle_mmio_sgi_reg,
-	},
-	{
-		.base		= GIC_DIST_CTRL,
-		.len		= 12,
-		.bits_per_irq	= 0,
-		.handle_mmio	= handle_mmio_misc,
-	},
-	{
-		.base		= GIC_DIST_IGROUP,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		.base		= GIC_DIST_ENABLE_SET,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_enable_reg,
-	},
-	{
-		.base		= GIC_DIST_ENABLE_CLEAR,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_enable_reg,
-	},
-	{
-		.base		= GIC_DIST_PENDING_SET,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_pending_reg,
-	},
-	{
-		.base		= GIC_DIST_PENDING_CLEAR,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_pending_reg,
-	},
-	{
-		.base		= GIC_DIST_ACTIVE_SET,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_active_reg,
-	},
-	{
-		.base		= GIC_DIST_ACTIVE_CLEAR,
-		.len		= VGIC_MAX_IRQS / 8,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_active_reg,
-	},
-	{
-		.base		= GIC_DIST_PRI,
-		.len		= VGIC_MAX_IRQS,
-		.bits_per_irq	= 8,
-		.handle_mmio	= handle_mmio_priority_reg,
-	},
-	{
-		.base		= GIC_DIST_TARGET,
-		.len		= VGIC_MAX_IRQS,
-		.bits_per_irq	= 8,
-		.handle_mmio	= handle_mmio_target_reg,
-	},
-	{
-		.base		= GIC_DIST_CONFIG,
-		.len		= VGIC_MAX_IRQS / 4,
-		.bits_per_irq	= 2,
-		.handle_mmio	= handle_mmio_cfg_reg,
-	},
-	{
-		.base		= GIC_DIST_SGI_PENDING_CLEAR,
-		.len		= VGIC_NR_SGIS,
-		.handle_mmio	= handle_mmio_sgi_clear,
-	},
-	{
-		.base		= GIC_DIST_SGI_PENDING_SET,
-		.len		= VGIC_NR_SGIS,
-		.handle_mmio	= handle_mmio_sgi_set,
-	},
-	{}
-};
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int nrcpus = atomic_read(&kvm->online_vcpus);
-	u8 target_cpus;
-	int sgi, mode, c, vcpu_id;
-
-	vcpu_id = vcpu->vcpu_id;
-
-	sgi = reg & 0xf;
-	target_cpus = (reg >> 16) & 0xff;
-	mode = (reg >> 24) & 3;
-
-	switch (mode) {
-	case 0:
-		if (!target_cpus)
-			return;
-		break;
-
-	case 1:
-		target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
-		break;
-
-	case 2:
-		target_cpus = 1 << vcpu_id;
-		break;
-	}
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (target_cpus & 1) {
-			/* Flag the SGI as pending */
-			vgic_dist_irq_set_pending(vcpu, sgi);
-			*vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
-			kvm_debug("SGI%d from CPU%d to CPU%d\n",
-				  sgi, vcpu_id, c);
-		}
-
-		target_cpus >>= 1;
-	}
-}
-
-static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	unsigned long sources;
-	int vcpu_id = vcpu->vcpu_id;
-	int c;
-
-	sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
-
-	for_each_set_bit(c, &sources, dist->nr_cpus) {
-		if (vgic_queue_irq(vcpu, c, irq))
-			clear_bit(c, &sources);
-	}
-
-	*vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
-
-	/*
-	 * If the sources bitmap has been cleared it means that we
-	 * could queue all the SGIs onto link registers (see the
-	 * clear_bit above), and therefore we are done with them in
-	 * our emulated gic and can get rid of them.
-	 */
-	if (!sources) {
-		vgic_dist_irq_clear_pending(vcpu, irq);
-		vgic_cpu_irq_clear(vcpu, irq);
-		return true;
-	}
-
-	return false;
-}
-
-/**
- * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
- * @kvm: pointer to the kvm struct
- *
- * Map the virtual CPU interface into the VM before running any VCPUs.  We
- * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.
- */
-static int vgic_v2_map_resources(struct kvm *kvm,
-				 const struct vgic_params *params)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int ret = 0;
-
-	if (!irqchip_in_kernel(kvm))
-		return 0;
-
-	mutex_lock(&kvm->lock);
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-	    IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-		kvm_err("Need to set vgic cpu and dist addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-				 KVM_VGIC_V2_DIST_SIZE,
-				 vgic_dist_ranges, -1, &dist->dist_iodev);
-
-	/*
-	 * Initialize the vgic if this hasn't already been done on demand by
-	 * accessing the vgic state from userspace.
-	 */
-	ret = vgic_init(kvm);
-	if (ret) {
-		kvm_err("Unable to allocate maps\n");
-		goto out_unregister;
-	}
-
-	ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-				    params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
-				    true);
-	if (ret) {
-		kvm_err("Unable to remap VGIC CPU to VCPU\n");
-		goto out_unregister;
-	}
-
-	dist->ready = true;
-	goto out;
-
-out_unregister:
-	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-
-out:
-	if (ret)
-		kvm_vgic_destroy(kvm);
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	*vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
-}
-
-static int vgic_v2_init_model(struct kvm *kvm)
-{
-	int i;
-
-	for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
-		vgic_set_target_reg(kvm, 0, i);
-
-	return 0;
-}
-
-void vgic_v2_init_emulation(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
-	dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
-	dist->vm_ops.init_model = vgic_v2_init_model;
-	dist->vm_ops.map_resources = vgic_v2_map_resources;
-
-	kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-}
-
-static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
-				 struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	bool updated = false;
-	struct vgic_vmcr vmcr;
-	u32 *vmcr_field;
-	u32 reg;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	switch (offset & ~0x3) {
-	case GIC_CPU_CTRL:
-		vmcr_field = &vmcr.ctlr;
-		break;
-	case GIC_CPU_PRIMASK:
-		vmcr_field = &vmcr.pmr;
-		break;
-	case GIC_CPU_BINPOINT:
-		vmcr_field = &vmcr.bpr;
-		break;
-	case GIC_CPU_ALIAS_BINPOINT:
-		vmcr_field = &vmcr.abpr;
-		break;
-	default:
-		BUG();
-	}
-
-	if (!mmio->is_write) {
-		reg = *vmcr_field;
-		mmio_data_write(mmio, ~0, reg);
-	} else {
-		reg = mmio_data_read(mmio, ~0);
-		if (reg != *vmcr_field) {
-			*vmcr_field = reg;
-			vgic_set_vmcr(vcpu, &vmcr);
-			updated = true;
-		}
-	}
-	return updated;
-}
-
-static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
-			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
-}
-
-static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
-				  struct kvm_exit_mmio *mmio,
-				  phys_addr_t offset)
-{
-	u32 reg;
-
-	if (mmio->is_write)
-		return false;
-
-	/* GICC_IIDR */
-	reg = (PRODUCT_ID_KVM << 20) |
-	      (GICC_ARCH_VERSION_V2 << 16) |
-	      (IMPLEMENTER_ARM << 0);
-	mmio_data_write(mmio, ~0, reg);
-	return false;
-}
-
-/*
- * CPU Interface Register accesses - these are not accessed by the VM, but by
- * user space for saving and restoring VGIC state.
- */
-static const struct vgic_io_range vgic_cpu_ranges[] = {
-	{
-		.base		= GIC_CPU_CTRL,
-		.len		= 12,
-		.handle_mmio	= handle_cpu_mmio_misc,
-	},
-	{
-		.base		= GIC_CPU_ALIAS_BINPOINT,
-		.len		= 4,
-		.handle_mmio	= handle_mmio_abpr,
-	},
-	{
-		.base		= GIC_CPU_ACTIVEPRIO,
-		.len		= 16,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		.base		= GIC_CPU_IDENT,
-		.len		= 4,
-		.handle_mmio	= handle_cpu_mmio_ident,
-	},
-};
-
-static int vgic_attr_regs_access(struct kvm_device *dev,
-				 struct kvm_device_attr *attr,
-				 u32 *reg, bool is_write)
-{
-	const struct vgic_io_range *r = NULL, *ranges;
-	phys_addr_t offset;
-	int ret, cpuid, c;
-	struct kvm_vcpu *vcpu, *tmp_vcpu;
-	struct vgic_dist *vgic;
-	struct kvm_exit_mmio mmio;
-	u32 data;
-
-	offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-	cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-		KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-	mutex_lock(&dev->kvm->lock);
-
-	ret = vgic_init(dev->kvm);
-	if (ret)
-		goto out;
-
-	if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-	vgic = &dev->kvm->arch.vgic;
-
-	mmio.len = 4;
-	mmio.is_write = is_write;
-	mmio.data = &data;
-	if (is_write)
-		mmio_data_write(&mmio, ~0, *reg);
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		mmio.phys_addr = vgic->vgic_dist_base + offset;
-		ranges = vgic_dist_ranges;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		mmio.phys_addr = vgic->vgic_cpu_base + offset;
-		ranges = vgic_cpu_ranges;
-		break;
-	default:
-		BUG();
-	}
-	r = vgic_find_range(ranges, 4, offset);
-
-	if (unlikely(!r || !r->handle_mmio)) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-
-	spin_lock(&vgic->lock);
-
-	/*
-	 * Ensure that no other VCPU is running by checking the vcpu->cpu
-	 * field.  If no other VPCUs are running we can safely access the VGIC
-	 * state, because even if another VPU is run after this point, that
-	 * VCPU will not touch the vgic state, because it will block on
-	 * getting the vgic->lock in kvm_vgic_sync_hwstate().
-	 */
-	kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
-		if (unlikely(tmp_vcpu->cpu != -1)) {
-			ret = -EBUSY;
-			goto out_vgic_unlock;
-		}
-	}
-
-	/*
-	 * Move all pending IRQs from the LRs on all VCPUs so the pending
-	 * state can be properly represented in the register state accessible
-	 * through this API.
-	 */
-	kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
-		vgic_unqueue_irqs(tmp_vcpu);
-
-	offset -= r->base;
-	r->handle_mmio(vcpu, &mmio, offset);
-
-	if (!is_write)
-		*reg = mmio_data_read(&mmio, ~0);
-
-	ret = 0;
-out_vgic_unlock:
-	spin_unlock(&vgic->lock);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static int vgic_v2_create(struct kvm_device *dev, u32 type)
-{
-	return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v2_destroy(struct kvm_device *dev)
-{
-	kfree(dev);
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_attr_regs_access(dev, attr, &reg, true);
-	}
-
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg = 0;
-
-		ret = vgic_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	phys_addr_t offset;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		case KVM_VGIC_V2_ADDR_TYPE_CPU:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-		return vgic_has_attr_regs(vgic_dist_ranges, offset);
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-		return vgic_has_attr_regs(vgic_cpu_ranges, offset);
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-	.name = "kvm-arm-vgic-v2",
-	.create = vgic_v2_create,
-	.destroy = vgic_v2_destroy,
-	.set_attr = vgic_v2_set_attr,
-	.get_attr = vgic_v2_get_attr,
-	.has_attr = vgic_v2_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
deleted file mode 100644
index 334cd7a891066d..00000000000000
--- a/virt/kvm/arm/vgic-v2.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-	struct vgic_lr lr_desc;
-	u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
-
-	lr_desc.irq	= val & GICH_LR_VIRTUALID;
-	if (lr_desc.irq <= 15)
-		lr_desc.source	= (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-	else
-		lr_desc.source = 0;
-	lr_desc.state	= 0;
-
-	if (val & GICH_LR_PENDING_BIT)
-		lr_desc.state |= LR_STATE_PENDING;
-	if (val & GICH_LR_ACTIVE_BIT)
-		lr_desc.state |= LR_STATE_ACTIVE;
-	if (val & GICH_LR_EOI)
-		lr_desc.state |= LR_EOI_INT;
-	if (val & GICH_LR_HW) {
-		lr_desc.state |= LR_HW;
-		lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
-	}
-
-	return lr_desc;
-}
-
-static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
-			   struct vgic_lr lr_desc)
-{
-	u32 lr_val;
-
-	lr_val = lr_desc.irq;
-
-	if (lr_desc.state & LR_STATE_PENDING)
-		lr_val |= GICH_LR_PENDING_BIT;
-	if (lr_desc.state & LR_STATE_ACTIVE)
-		lr_val |= GICH_LR_ACTIVE_BIT;
-	if (lr_desc.state & LR_EOI_INT)
-		lr_val |= GICH_LR_EOI;
-
-	if (lr_desc.state & LR_HW) {
-		lr_val |= GICH_LR_HW;
-		lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
-	}
-
-	if (lr_desc.irq < VGIC_NR_SGIS)
-		lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
-
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-
-	if (!(lr_desc.state & LR_STATE_MASK))
-		vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
-	else
-		vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
-}
-
-static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
-}
-
-static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
-}
-
-static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
-}
-
-static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-	u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
-	u32 ret = 0;
-
-	if (misr & GICH_MISR_EOI)
-		ret |= INT_STATUS_EOI;
-	if (misr & GICH_MISR_U)
-		ret |= INT_STATUS_UNDERFLOW;
-
-	return ret;
-}
-
-static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
-}
-
-static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
-}
-
-static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
-
-	vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
-	vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-	vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
-	vmcrp->pmr  = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
-}
-
-static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	u32 vmcr;
-
-	vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
-	vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
-	vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
-	vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
-}
-
-static void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
-
-	/* Get the show on the road... */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v2_ops = {
-	.get_lr			= vgic_v2_get_lr,
-	.set_lr			= vgic_v2_set_lr,
-	.get_elrsr		= vgic_v2_get_elrsr,
-	.get_eisr		= vgic_v2_get_eisr,
-	.clear_eisr		= vgic_v2_clear_eisr,
-	.get_interrupt_status	= vgic_v2_get_interrupt_status,
-	.enable_underflow	= vgic_v2_enable_underflow,
-	.disable_underflow	= vgic_v2_disable_underflow,
-	.get_vmcr		= vgic_v2_get_vmcr,
-	.set_vmcr		= vgic_v2_set_vmcr,
-	.enable			= vgic_v2_enable,
-};
-
-struct vgic_params __section(.hyp.text) vgic_v2_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-	struct vgic_params *vgic = params;
-	int i;
-
-	for (i = 0; i < vgic->nr_lr; i++)
-		writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4));
-}
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
- * @gic_kvm_info:	pointer to the GIC description
- * @ops:		address of a pointer to the GICv2 operations
- * @params:		address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv2 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
-		   const struct vgic_ops **ops,
-		   const struct vgic_params **params)
-{
-	int ret;
-	struct vgic_params *vgic = &vgic_v2_params;
-	const struct resource *vctrl_res = &gic_kvm_info->vctrl;
-	const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-	memset(vgic, 0, sizeof(*vgic));
-
-	if (!gic_kvm_info->maint_irq) {
-		kvm_err("error getting vgic maintenance irq\n");
-		ret = -ENXIO;
-		goto out;
-	}
-	vgic->maint_irq = gic_kvm_info->maint_irq;
-
-	if (!gic_kvm_info->vctrl.start) {
-		kvm_err("GICH not present in the firmware table\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
-				   resource_size(&gic_kvm_info->vctrl));
-	if (!vgic->vctrl_base) {
-		kvm_err("Cannot ioremap GICH\n");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
-	vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
-
-	ret = create_hyp_io_mappings(vgic->vctrl_base,
-				     vgic->vctrl_base + resource_size(vctrl_res),
-				     vctrl_res->start);
-	if (ret) {
-		kvm_err("Cannot map VCTRL into hyp\n");
-		goto out_unmap;
-	}
-
-	if (!PAGE_ALIGNED(vcpu_res->start)) {
-		kvm_err("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)vcpu_res->start);
-		ret = -ENXIO;
-		goto out_unmap;
-	}
-
-	if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-		kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-			(unsigned long long)resource_size(vcpu_res),
-			PAGE_SIZE);
-		ret = -ENXIO;
-		goto out_unmap;
-	}
-
-	vgic->can_emulate_gicv2 = true;
-	kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
-
-	vgic->vcpu_base = vcpu_res->start;
-
-	kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
-		 gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);
-
-	vgic->type = VGIC_V2;
-	vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-	on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-	*ops = &vgic_v2_ops;
-	*params = vgic;
-	goto out;
-
-out_unmap:
-	iounmap(vgic->vctrl_base);
-out:
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
deleted file mode 100644
index e661e7fb9d9187..00000000000000
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-/*
- * GICv3 distributor and redistributor emulation
- *
- * GICv3 emulation is currently only supported on a GICv3 host (because
- * we rely on the hardware's CPU interface virtualization support), but
- * supports both hardware with or without the optional GICv2 backwards
- * compatibility features.
- *
- * Limitations of the emulation:
- * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
- * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
- * - We do not support the message based interrupts (MBIs) triggered by
- *   writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
- * - We do not support the (optional) backwards compatibility feature.
- *   GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
- *   the compatiblity feature, you can use a GICv2 in the guest, though.
- * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
- * - Priorities are not emulated (same as the GICv2 emulation). Linux
- *   as a guest is fine with this, because it does not use priorities.
- * - We only support Group1 interrupts. Again Linux uses only those.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
-			       struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg = 0xffffffff;
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-	return false;
-}
-
-static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
-			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg = 0;
-
-	/*
-	 * Force ARE and DS to 1, the guest cannot change this.
-	 * For the time being we only support Group1 interrupts.
-	 */
-	if (vcpu->kvm->arch.vgic.enabled)
-		reg = GICD_CTLR_ENABLE_SS_G1;
-	reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	if (mmio->is_write) {
-		vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
-		vgic_update_state(vcpu->kvm);
-		return true;
-	}
-	return false;
-}
-
-/*
- * As this implementation does not provide compatibility
- * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
- * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
- */
-#define INTERRUPT_ID_BITS 10
-static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
-			      struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg;
-
-	reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
-
-	reg |= (INTERRUPT_ID_BITS - 1) << 19;
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-	return false;
-}
-
-static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
-			     struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
-	u32 reg;
-
-	reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-	return false;
-}
-
-static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
-					    struct kvm_exit_mmio *mmio,
-					    phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-					      vcpu->vcpu_id,
-					      ACCESS_WRITE_SETBIT);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
-					      struct kvm_exit_mmio *mmio,
-					      phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-					      vcpu->vcpu_id,
-					      ACCESS_WRITE_CLEARBIT);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
-					     struct kvm_exit_mmio *mmio,
-					     phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-						   vcpu->vcpu_id);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
-					       struct kvm_exit_mmio *mmio,
-					       phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-						     vcpu->vcpu_id);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
-					    struct kvm_exit_mmio *mmio,
-					    phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-						   vcpu->vcpu_id);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
-					      struct kvm_exit_mmio *mmio,
-					      phys_addr_t offset)
-{
-	if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
-		return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-						    vcpu->vcpu_id);
-
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
-					  struct kvm_exit_mmio *mmio,
-					  phys_addr_t offset)
-{
-	u32 *reg;
-
-	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
-		vgic_reg_access(mmio, NULL, offset,
-				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-		return false;
-	}
-
-	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-				   vcpu->vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset,
-		ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	return false;
-}
-
-static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
-				     struct kvm_exit_mmio *mmio,
-				     phys_addr_t offset)
-{
-	u32 *reg;
-
-	if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
-		vgic_reg_access(mmio, NULL, offset,
-				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-		return false;
-	}
-
-	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				  vcpu->vcpu_id, offset >> 1);
-
-	return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-/*
- * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
- * when we store the target MPIDR written by the guest.
- */
-static u32 compress_mpidr(unsigned long mpidr)
-{
-	u32 ret;
-
-	ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
-	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
-	ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
-
-	return ret;
-}
-
-static unsigned long uncompress_mpidr(u32 value)
-{
-	unsigned long mpidr;
-
-	mpidr  = ((value >>  0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
-	mpidr |= ((value >>  8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
-	mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
-	mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
-
-	return mpidr;
-}
-
-/*
- * Lookup the given MPIDR value to get the vcpu_id (if there is one)
- * and store that in the irq_spi_cpu[] array.
- * This limits the number of VCPUs to 255 for now, extending the data
- * type (or storing kvm_vcpu pointers) should lift the limit.
- * Store the original MPIDR value in an extra array to support read-as-written.
- * Unallocated MPIDRs are translated to a special value and caught
- * before any array accesses.
- */
-static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
-				  struct kvm_exit_mmio *mmio,
-				  phys_addr_t offset)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int spi;
-	u32 reg;
-	int vcpu_id;
-	unsigned long *bmap, mpidr;
-
-	/*
-	 * The upper 32 bits of each 64 bit register are zero,
-	 * as we don't support Aff3.
-	 */
-	if ((offset & 4)) {
-		vgic_reg_access(mmio, NULL, offset,
-				ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-		return false;
-	}
-
-	/* This region only covers SPIs, so no handling of private IRQs here. */
-	spi = offset / 8;
-
-	/* get the stored MPIDR for this IRQ */
-	mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
-	reg = mpidr;
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-
-	if (!mmio->is_write)
-		return false;
-
-	/*
-	 * Now clear the currently assigned vCPU from the map, making room
-	 * for the new one to be written below
-	 */
-	vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
-	if (likely(vcpu)) {
-		vcpu_id = vcpu->vcpu_id;
-		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-		__clear_bit(spi, bmap);
-	}
-
-	dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
-	vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
-
-	/*
-	 * The spec says that non-existent MPIDR values should not be
-	 * forwarded to any existent (v)CPU, but should be able to become
-	 * pending anyway. We simply keep the irq_spi_target[] array empty, so
-	 * the interrupt will never be injected.
-	 * irq_spi_cpu[irq] gets a magic value in this case.
-	 */
-	if (likely(vcpu)) {
-		vcpu_id = vcpu->vcpu_id;
-		dist->irq_spi_cpu[spi] = vcpu_id;
-		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
-		__set_bit(spi, bmap);
-	} else {
-		dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
-	}
-
-	vgic_update_state(kvm);
-
-	return true;
-}
-
-/*
- * We should be careful about promising too much when a guest reads
- * this register. Don't claim to be like any hardware implementation,
- * but just report the GIC as version 3 - which is what a Linux guest
- * would check.
- */
-static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
-			       struct kvm_exit_mmio *mmio,
-			       phys_addr_t offset)
-{
-	u32 reg = 0;
-
-	switch (offset + GICD_IDREGS) {
-	case GICD_PIDR2:
-		reg = 0x3b;
-		break;
-	}
-
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
-	return false;
-}
-
-static const struct vgic_io_range vgic_v3_dist_ranges[] = {
-	{
-		.base           = GICD_CTLR,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_ctlr,
-	},
-	{
-		.base           = GICD_TYPER,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_typer,
-	},
-	{
-		.base           = GICD_IIDR,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_iidr,
-	},
-	{
-		/* this register is optional, it is RAZ/WI if not implemented */
-		.base           = GICD_STATUSR,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_raz_wi,
-	},
-	{
-		/* this write only register is WI when TYPER.MBIS=0 */
-		.base		= GICD_SETSPI_NSR,
-		.len		= 0x04,
-		.bits_per_irq	= 0,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this write only register is WI when TYPER.MBIS=0 */
-		.base		= GICD_CLRSPI_NSR,
-		.len		= 0x04,
-		.bits_per_irq	= 0,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when DS=1 */
-		.base		= GICD_SETSPI_SR,
-		.len		= 0x04,
-		.bits_per_irq	= 0,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when DS=1 */
-		.base		= GICD_CLRSPI_SR,
-		.len		= 0x04,
-		.bits_per_irq	= 0,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		.base		= GICD_IGROUPR,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_rao_wi,
-	},
-	{
-		.base		= GICD_ISENABLER,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_enable_reg_dist,
-	},
-	{
-		.base		= GICD_ICENABLER,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_enable_reg_dist,
-	},
-	{
-		.base		= GICD_ISPENDR,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_pending_reg_dist,
-	},
-	{
-		.base		= GICD_ICPENDR,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_pending_reg_dist,
-	},
-	{
-		.base		= GICD_ISACTIVER,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_active_reg_dist,
-	},
-	{
-		.base		= GICD_ICACTIVER,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_active_reg_dist,
-	},
-	{
-		.base		= GICD_IPRIORITYR,
-		.len		= 0x400,
-		.bits_per_irq	= 8,
-		.handle_mmio	= handle_mmio_priority_reg_dist,
-	},
-	{
-		/* TARGETSRn is RES0 when ARE=1 */
-		.base		= GICD_ITARGETSR,
-		.len		= 0x400,
-		.bits_per_irq	= 8,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		.base		= GICD_ICFGR,
-		.len		= 0x100,
-		.bits_per_irq	= 2,
-		.handle_mmio	= handle_mmio_cfg_reg_dist,
-	},
-	{
-		/* this is RAZ/WI when DS=1 */
-		.base		= GICD_IGRPMODR,
-		.len		= 0x80,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when DS=1 */
-		.base		= GICD_NSACR,
-		.len		= 0x100,
-		.bits_per_irq	= 2,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when ARE=1 */
-		.base		= GICD_SGIR,
-		.len		= 0x04,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when ARE=1 */
-		.base		= GICD_CPENDSGIR,
-		.len		= 0x10,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		/* this is RAZ/WI when ARE=1 */
-		.base           = GICD_SPENDSGIR,
-		.len            = 0x10,
-		.handle_mmio    = handle_mmio_raz_wi,
-	},
-	{
-		.base		= GICD_IROUTER + 0x100,
-		.len		= 0x1ee0,
-		.bits_per_irq	= 64,
-		.handle_mmio	= handle_mmio_route_reg,
-	},
-	{
-		.base           = GICD_IDREGS,
-		.len            = 0x30,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_idregs,
-	},
-	{},
-};
-
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
-				    struct kvm_exit_mmio *mmio,
-				    phys_addr_t offset)
-{
-	/* since we don't support LPIs, this register is zero for now */
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
-				     struct kvm_exit_mmio *mmio,
-				     phys_addr_t offset)
-{
-	u32 reg;
-	u64 mpidr;
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-	int target_vcpu_id = redist_vcpu->vcpu_id;
-
-	/* the upper 32 bits contain the affinity value */
-	if ((offset & ~3) == 4) {
-		mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
-		reg = compress_mpidr(mpidr);
-
-		vgic_reg_access(mmio, &reg, offset,
-				ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-		return false;
-	}
-
-	reg = redist_vcpu->vcpu_id << 8;
-	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
-		reg |= GICR_TYPER_LAST;
-	vgic_reg_access(mmio, &reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
-					      struct kvm_exit_mmio *mmio,
-					      phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-				      redist_vcpu->vcpu_id,
-				      ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
-						struct kvm_exit_mmio *mmio,
-						phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
-				      redist_vcpu->vcpu_id,
-				      ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
-					      struct kvm_exit_mmio *mmio,
-					      phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
-					  redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
-						struct kvm_exit_mmio *mmio,
-						phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
-					     redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
-					       struct kvm_exit_mmio *mmio,
-					       phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
-					   redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
-						 struct kvm_exit_mmio *mmio,
-						 phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
-					     redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
-					    struct kvm_exit_mmio *mmio,
-					    phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-	u32 *reg;
-
-	reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
-				   redist_vcpu->vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	return false;
-}
-
-static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
-				       struct kvm_exit_mmio *mmio,
-				       phys_addr_t offset)
-{
-	struct kvm_vcpu *redist_vcpu = mmio->private;
-
-	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				       redist_vcpu->vcpu_id, offset >> 1);
-
-	return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-#define SGI_base(x) ((x) + SZ_64K)
-
-static const struct vgic_io_range vgic_redist_ranges[] = {
-	{
-		.base           = GICR_CTLR,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_ctlr_redist,
-	},
-	{
-		.base           = GICR_TYPER,
-		.len            = 0x08,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_typer_redist,
-	},
-	{
-		.base           = GICR_IIDR,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_iidr,
-	},
-	{
-		.base           = GICR_WAKER,
-		.len            = 0x04,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_raz_wi,
-	},
-	{
-		.base           = GICR_IDREGS,
-		.len            = 0x30,
-		.bits_per_irq   = 0,
-		.handle_mmio    = handle_mmio_idregs,
-	},
-	{
-		.base		= SGI_base(GICR_IGROUPR0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_rao_wi,
-	},
-	{
-		.base		= SGI_base(GICR_ISENABLER0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_enable_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ICENABLER0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_enable_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ISPENDR0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_pending_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ICPENDR0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_pending_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ISACTIVER0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_set_active_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ICACTIVER0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_clear_active_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_IPRIORITYR0),
-		.len		= 0x20,
-		.bits_per_irq	= 8,
-		.handle_mmio	= handle_mmio_priority_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_ICFGR0),
-		.len		= 0x08,
-		.bits_per_irq	= 2,
-		.handle_mmio	= handle_mmio_cfg_reg_redist,
-	},
-	{
-		.base		= SGI_base(GICR_IGRPMODR0),
-		.len		= 0x04,
-		.bits_per_irq	= 1,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{
-		.base		= SGI_base(GICR_NSACR),
-		.len		= 0x04,
-		.handle_mmio	= handle_mmio_raz_wi,
-	},
-	{},
-};
-
-static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-	if (vgic_queue_irq(vcpu, 0, irq)) {
-		vgic_dist_irq_clear_pending(vcpu, irq);
-		vgic_cpu_irq_clear(vcpu, irq);
-		return true;
-	}
-
-	return false;
-}
-
-static int vgic_v3_map_resources(struct kvm *kvm,
-				 const struct vgic_params *params)
-{
-	int ret = 0;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	gpa_t rdbase = dist->vgic_redist_base;
-	struct vgic_io_device *iodevs = NULL;
-	int i;
-
-	if (!irqchip_in_kernel(kvm))
-		return 0;
-
-	mutex_lock(&kvm->lock);
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-	    IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
-		kvm_err("Need to set vgic distributor addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	/*
-	 * For a VGICv3 we require the userland to explicitly initialize
-	 * the VGIC before we need to use it.
-	 */
-	if (!vgic_initialized(kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
-				       GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
-				       -1, &dist->dist_iodev);
-	if (ret)
-		goto out;
-
-	iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
-	if (!iodevs) {
-		ret = -ENOMEM;
-		goto out_unregister;
-	}
-
-	for (i = 0; i < dist->nr_cpus; i++) {
-		ret = vgic_register_kvm_io_dev(kvm, rdbase,
-					       SZ_128K, vgic_redist_ranges,
-					       i, &iodevs[i]);
-		if (ret)
-			goto out_unregister;
-		rdbase += GIC_V3_REDIST_SIZE;
-	}
-
-	dist->redist_iodevs = iodevs;
-	dist->ready = true;
-	goto out;
-
-out_unregister:
-	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-	if (iodevs) {
-		for (i = 0; i < dist->nr_cpus; i++) {
-			if (iodevs[i].dev.ops)
-				kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-							  &iodevs[i].dev);
-		}
-	}
-
-out:
-	if (ret)
-		kvm_vgic_destroy(kvm);
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-static int vgic_v3_init_model(struct kvm *kvm)
-{
-	int i;
-	u32 mpidr;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-	dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
-				      GFP_KERNEL);
-
-	if (!dist->irq_spi_mpidr)
-		return -ENOMEM;
-
-	/* Initialize the target VCPUs for each IRQ to VCPU 0 */
-	mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
-	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
-		dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
-		dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
-		vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
-	}
-
-	return 0;
-}
-
-/* GICv3 does not keep track of SGI sources anymore. */
-static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-}
-
-void vgic_v3_init_emulation(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
-	dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
-	dist->vm_ops.init_model = vgic_v3_init_model;
-	dist->vm_ops.map_resources = vgic_v3_map_resources;
-
-	kvm->arch.max_vcpus = KVM_MAX_VCPUS;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-	unsigned long affinity;
-	int level0;
-
-	/*
-	 * Split the current VCPU's MPIDR into affinity level 0 and the
-	 * rest as this is what we have to compare against.
-	 */
-	affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-	level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-	affinity &= ~MPIDR_LEVEL_MASK;
-
-	/* bail out if the upper three levels don't match */
-	if (sgi_aff != affinity)
-		return -1;
-
-	/* Is this VCPU's bit set in the mask ? */
-	if (!(sgi_cpu_mask & BIT(level0)))
-		return -1;
-
-	return level0;
-}
-
-#define SGI_AFFINITY_LEVEL(reg, level) \
-	((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-	>> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *c_vcpu;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	u16 target_cpus;
-	u64 mpidr;
-	int sgi, c;
-	int vcpu_id = vcpu->vcpu_id;
-	bool broadcast;
-	int updated = 0;
-
-	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-	broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-	target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-	mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-	/*
-	 * We take the dist lock here, because we come from the sysregs
-	 * code path and not from the MMIO one (which already takes the lock).
-	 */
-	spin_lock(&dist->lock);
-
-	/*
-	 * We iterate over all VCPUs to find the MPIDRs matching the request.
-	 * If we have handled one CPU, we clear it's bit to detect early
-	 * if we are already finished. This avoids iterating through all
-	 * VCPUs when most of the times we just signal a single VCPU.
-	 */
-	kvm_for_each_vcpu(c, c_vcpu, kvm) {
-
-		/* Exit early if we have dealt with all requested CPUs */
-		if (!broadcast && target_cpus == 0)
-			break;
-
-		 /* Don't signal the calling VCPU */
-		if (broadcast && c == vcpu_id)
-			continue;
-
-		if (!broadcast) {
-			int level0;
-
-			level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-			if (level0 == -1)
-				continue;
-
-			/* remove this matching VCPU from the mask */
-			target_cpus &= ~BIT(level0);
-		}
-
-		/* Flag the SGI as pending */
-		vgic_dist_irq_set_pending(c_vcpu, sgi);
-		updated = 1;
-		kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
-	}
-	if (updated)
-		vgic_update_state(vcpu->kvm);
-	spin_unlock(&dist->lock);
-	if (updated)
-		vgic_kick_vcpus(vcpu->kvm);
-}
-
-static int vgic_v3_create(struct kvm_device *dev, u32 type)
-{
-	return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v3_destroy(struct kvm_device *dev)
-{
-	kfree(dev);
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		return -ENXIO;
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		return -ENXIO;
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		case KVM_VGIC_V2_ADDR_TYPE_CPU:
-			return -ENXIO;
-		case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		return -ENXIO;
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-	.name = "kvm-arm-vgic-v3",
-	.create = vgic_v3_create,
-	.destroy = vgic_v3_destroy,
-	.set_attr = vgic_v3_set_attr,
-	.get_attr = vgic_v3_get_attr,
-	.has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
deleted file mode 100644
index 75b02fa86436ac..00000000000000
--- a/virt/kvm/arm/vgic-v3.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-static u32 ich_vtr_el2;
-
-static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-	struct vgic_lr lr_desc;
-	u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
-
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-		lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
-	else
-		lr_desc.irq = val & GICH_LR_VIRTUALID;
-
-	lr_desc.source = 0;
-	if (lr_desc.irq <= 15 &&
-	    vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-		lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-
-	lr_desc.state = 0;
-
-	if (val & ICH_LR_PENDING_BIT)
-		lr_desc.state |= LR_STATE_PENDING;
-	if (val & ICH_LR_ACTIVE_BIT)
-		lr_desc.state |= LR_STATE_ACTIVE;
-	if (val & ICH_LR_EOI)
-		lr_desc.state |= LR_EOI_INT;
-	if (val & ICH_LR_HW) {
-		lr_desc.state |= LR_HW;
-		lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
-	}
-
-	return lr_desc;
-}
-
-static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
-			   struct vgic_lr lr_desc)
-{
-	u64 lr_val;
-
-	lr_val = lr_desc.irq;
-
-	/*
-	 * Currently all guest IRQs are Group1, as Group0 would result
-	 * in a FIQ in the guest, which it wouldn't expect.
-	 * Eventually we want to make this configurable, so we may revisit
-	 * this in the future.
-	 */
-	switch (vcpu->kvm->arch.vgic.vgic_model) {
-	case KVM_DEV_TYPE_ARM_VGIC_V3:
-		lr_val |= ICH_LR_GROUP;
-		break;
-	case  KVM_DEV_TYPE_ARM_VGIC_V2:
-		if (lr_desc.irq < VGIC_NR_SGIS)
-			lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
-		break;
-	default:
-		BUG();
-	}
-
-	if (lr_desc.state & LR_STATE_PENDING)
-		lr_val |= ICH_LR_PENDING_BIT;
-	if (lr_desc.state & LR_STATE_ACTIVE)
-		lr_val |= ICH_LR_ACTIVE_BIT;
-	if (lr_desc.state & LR_EOI_INT)
-		lr_val |= ICH_LR_EOI;
-	if (lr_desc.state & LR_HW) {
-		lr_val |= ICH_LR_HW;
-		lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
-	}
-
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val;
-
-	if (!(lr_desc.state & LR_STATE_MASK))
-		vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
-	else
-		vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
-}
-
-static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
-}
-
-static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
-}
-
-static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
-}
-
-static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
-	u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
-	u32 ret = 0;
-
-	if (misr & ICH_MISR_EOI)
-		ret |= INT_STATUS_EOI;
-	if (misr & ICH_MISR_U)
-		ret |= INT_STATUS_UNDERFLOW;
-
-	return ret;
-}
-
-static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
-
-	vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
-	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-}
-
-static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
-}
-
-static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
-}
-
-static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	u32 vmcr;
-
-	vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
-	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
-}
-
-static void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vgic_v3->vgic_vmcr = 0;
-	vgic_v3->vgic_elrsr = ~0;
-
-	/*
-	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
-	 * way, so we force SRE to 1 to demonstrate this to the guest.
-	 * This goes with the spec allowing the value to be RAO/WI.
-	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-		vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-	else
-		vgic_v3->vgic_sre = 0;
-
-	/* Get the show on the road... */
-	vgic_v3->vgic_hcr = ICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v3_ops = {
-	.get_lr			= vgic_v3_get_lr,
-	.set_lr			= vgic_v3_set_lr,
-	.get_elrsr		= vgic_v3_get_elrsr,
-	.get_eisr		= vgic_v3_get_eisr,
-	.clear_eisr		= vgic_v3_clear_eisr,
-	.get_interrupt_status	= vgic_v3_get_interrupt_status,
-	.enable_underflow	= vgic_v3_enable_underflow,
-	.disable_underflow	= vgic_v3_disable_underflow,
-	.get_vmcr		= vgic_v3_get_vmcr,
-	.set_vmcr		= vgic_v3_set_vmcr,
-	.enable			= vgic_v3_enable,
-};
-
-static struct vgic_params vgic_v3_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
-	kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
- * @gic_kvm_info:	pointer to the GIC description
- * @ops:		address of a pointer to the GICv3 operations
- * @params:		address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv3 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
-		  const struct vgic_ops **ops,
-		  const struct vgic_params **params)
-{
-	int ret = 0;
-	struct vgic_params *vgic = &vgic_v3_params;
-	const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
-	vgic->maint_irq = gic_kvm_info->maint_irq;
-
-	ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-
-	/*
-	 * The ListRegs field is 5 bits, but there is a architectural
-	 * maximum of 16 list registers. Just ignore bit 4...
-	 */
-	vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
-	vgic->can_emulate_gicv2 = false;
-
-	if (!vcpu_res->start) {
-		kvm_info("GICv3: no GICV resource entry\n");
-		vgic->vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(vcpu_res->start)) {
-		pr_warn("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)vcpu_res->start);
-		vgic->vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
-		pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
-			(unsigned long long)resource_size(vcpu_res),
-			PAGE_SIZE);
-	} else {
-		vgic->vcpu_base = vcpu_res->start;
-		vgic->can_emulate_gicv2 = true;
-		kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-					KVM_DEV_TYPE_ARM_VGIC_V2);
-	}
-	if (vgic->vcpu_base == 0)
-		kvm_info("disabling GICv2 emulation\n");
-	kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
-
-	vgic->vctrl_base = NULL;
-	vgic->type = VGIC_V3;
-	vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-	kvm_info("GICV base=0x%llx, IRQ=%d\n",
-		 vgic->vcpu_base, vgic->maint_irq);
-
-	on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
-	*ops = &vgic_v3_ops;
-	*params = vgic;
-
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
deleted file mode 100644
index c3bfbb981e73bf..00000000000000
--- a/virt/kvm/arm/vgic.c
+++ /dev/null
@@ -1,2440 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <trace/events/kvm.h>
-#include <asm/kvm.h>
-#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-/*
- * How the whole thing works (courtesy of Christoffer Dall):
- *
- * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- *   something is pending on the CPU interface.
- * - Interrupts that are pending on the distributor are stored on the
- *   vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
- *   ioctls and guest mmio ops, and other in-kernel peripherals such as the
- *   arch. timers).
- * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
- *   recalculated
- * - To calculate the oracle, we need info for each cpu from
- *   compute_pending_for_cpu, which considers:
- *   - PPI: dist->irq_pending & dist->irq_enable
- *   - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
- *   - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
- *     registers, stored on each vcpu. We only keep one bit of
- *     information per interrupt, making sure that only one vcpu can
- *     accept the interrupt.
- * - If any of the above state changes, we must recalculate the oracle.
- * - The same is true when injecting an interrupt, except that we only
- *   consider a single interrupt at a time. The irq_spi_cpu array
- *   contains the target CPU for each SPI.
- *
- * The handling of level interrupts adds some extra complexity. We
- * need to track when the interrupt has been EOIed, so we can sample
- * the 'line' again. This is achieved as such:
- *
- * - When a level interrupt is moved onto a vcpu, the corresponding
- *   bit in irq_queued is set. As long as this bit is set, the line
- *   will be ignored for further interrupts. The interrupt is injected
- *   into the vcpu with the GICH_LR_EOI bit set (generate a
- *   maintenance interrupt on EOI).
- * - When the interrupt is EOIed, the maintenance interrupt fires,
- *   and clears the corresponding bit in irq_queued. This allows the
- *   interrupt line to be sampled again.
- * - Note that level-triggered interrupts can also be set to pending from
- *   writes to GICD_ISPENDRn and lowering the external input line does not
- *   cause the interrupt to become inactive in such a situation.
- *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
- *   inactive as long as the external input line is held high.
- *
- *
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *   [to be renamed to kvm_vgic_init??]
- *
- * CPU Interface:
- *
- * - kvm_vgic_cpu_early_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
-
-#include "vgic.h"
-
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
-static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-						int virt_irq);
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
-
-static const struct vgic_ops *vgic_ops;
-static const struct vgic_params *vgic;
-
-static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-	vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
-}
-
-static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
-	return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
-}
-
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-	return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
-}
-
-/*
- * struct vgic_bitmap contains a bitmap made of unsigned longs, but
- * extracts u32s out of them.
- *
- * This does not work on 64-bit BE systems, because the bitmap access
- * will store two consecutive 32-bit words with the higher-addressed
- * register's bits at the lower index and the lower-addressed register's
- * bits at the higher index.
- *
- * Therefore, swizzle the register index when accessing the 32-bit word
- * registers to access the right register's value.
- */
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
-#define REG_OFFSET_SWIZZLE	1
-#else
-#define REG_OFFSET_SWIZZLE	0
-#endif
-
-static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
-{
-	int nr_longs;
-
-	nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-
-	b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
-	if (!b->private)
-		return -ENOMEM;
-
-	b->shared = b->private + nr_cpus;
-
-	return 0;
-}
-
-static void vgic_free_bitmap(struct vgic_bitmap *b)
-{
-	kfree(b->private);
-	b->private = NULL;
-	b->shared = NULL;
-}
-
-/*
- * Call this function to convert a u64 value to an unsigned long * bitmask
- * in a way that works on both 32-bit and 64-bit LE and BE platforms.
- *
- * Warning: Calling this function may modify *val.
- */
-static unsigned long *u64_to_bitmask(u64 *val)
-{
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
-	*val = (*val >> 32) | (*val << 32);
-#endif
-	return (unsigned long *)val;
-}
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
-{
-	offset >>= 2;
-	if (!offset)
-		return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
-	else
-		return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
-}
-
-static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
-				   int cpuid, int irq)
-{
-	if (irq < VGIC_NR_PRIVATE_IRQS)
-		return test_bit(irq, x->private + cpuid);
-
-	return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
-}
-
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-			     int irq, int val)
-{
-	unsigned long *reg;
-
-	if (irq < VGIC_NR_PRIVATE_IRQS) {
-		reg = x->private + cpuid;
-	} else {
-		reg = x->shared;
-		irq -= VGIC_NR_PRIVATE_IRQS;
-	}
-
-	if (val)
-		set_bit(irq, reg);
-	else
-		clear_bit(irq, reg);
-}
-
-static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
-{
-	return x->private + cpuid;
-}
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
-{
-	return x->shared;
-}
-
-static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
-{
-	int size;
-
-	size  = nr_cpus * VGIC_NR_PRIVATE_IRQS;
-	size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
-	x->private = kzalloc(size, GFP_KERNEL);
-	if (!x->private)
-		return -ENOMEM;
-
-	x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
-	return 0;
-}
-
-static void vgic_free_bytemap(struct vgic_bytemap *b)
-{
-	kfree(b->private);
-	b->private = NULL;
-	b->shared = NULL;
-}
-
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
-{
-	u32 *reg;
-
-	if (offset < VGIC_NR_PRIVATE_IRQS) {
-		reg = x->private;
-		offset += cpuid * VGIC_NR_PRIVATE_IRQS;
-	} else {
-		reg = x->shared;
-		offset -= VGIC_NR_PRIVATE_IRQS;
-	}
-
-	return reg + (offset / sizeof(u32));
-}
-
-#define VGIC_CFG_LEVEL	0
-#define VGIC_CFG_EDGE	1
-
-static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int irq_val;
-
-	irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
-	return irq_val == VGIC_CFG_EDGE;
-}
-
-static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
-}
-
-static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
-	if (!vgic_dist_irq_get_level(vcpu, irq)) {
-		vgic_dist_irq_clear_pending(vcpu, irq);
-		if (!compute_pending_for_cpu(vcpu))
-			clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-	}
-}
-
-static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
-}
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
-}
-
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
-{
-	if (irq < VGIC_NR_PRIVATE_IRQS)
-		set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-	else
-		set_bit(irq - VGIC_NR_PRIVATE_IRQS,
-			vcpu->arch.vgic_cpu.pending_shared);
-}
-
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
-{
-	if (irq < VGIC_NR_PRIVATE_IRQS)
-		clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
-	else
-		clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
-			  vcpu->arch.vgic_cpu.pending_shared);
-}
-
-static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
-{
-	return !vgic_irq_is_queued(vcpu, irq);
-}
-
-/**
- * vgic_reg_access - access vgic register
- * @mmio:   pointer to the data describing the mmio access
- * @reg:    pointer to the virtual backing of vgic distributor data
- * @offset: least significant 2 bits used for word offset
- * @mode:   ACCESS_ mode (see defines above)
- *
- * Helper to make vgic register access easier using one of the access
- * modes defined for vgic register access
- * (read,raz,write-ignored,setbit,clearbit,write)
- */
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-		     phys_addr_t offset, int mode)
-{
-	int word_offset = (offset & 3) * 8;
-	u32 mask = (1UL << (mmio->len * 8)) - 1;
-	u32 regval;
-
-	/*
-	 * Any alignment fault should have been delivered to the guest
-	 * directly (ARM ARM B3.12.7 "Prioritization of aborts").
-	 */
-
-	if (reg) {
-		regval = *reg;
-	} else {
-		BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
-		regval = 0;
-	}
-
-	if (mmio->is_write) {
-		u32 data = mmio_data_read(mmio, mask) << word_offset;
-		switch (ACCESS_WRITE_MASK(mode)) {
-		case ACCESS_WRITE_IGNORED:
-			return;
-
-		case ACCESS_WRITE_SETBIT:
-			regval |= data;
-			break;
-
-		case ACCESS_WRITE_CLEARBIT:
-			regval &= ~data;
-			break;
-
-		case ACCESS_WRITE_VALUE:
-			regval = (regval & ~(mask << word_offset)) | data;
-			break;
-		}
-		*reg = regval;
-	} else {
-		switch (ACCESS_READ_MASK(mode)) {
-		case ACCESS_READ_RAZ:
-			regval = 0;
-			/* fall through */
-
-		case ACCESS_READ_VALUE:
-			mmio_data_write(mmio, mask, regval >> word_offset);
-		}
-	}
-}
-
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-			phys_addr_t offset)
-{
-	vgic_reg_access(mmio, NULL, offset,
-			ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-	return false;
-}
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-			    phys_addr_t offset, int vcpu_id, int access)
-{
-	u32 *reg;
-	int mode = ACCESS_READ_VALUE | access;
-	struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
-
-	reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset, mode);
-	if (mmio->is_write) {
-		if (access & ACCESS_WRITE_CLEARBIT) {
-			if (offset < 4) /* Force SGI enabled */
-				*reg |= 0xffff;
-			vgic_retire_disabled_irqs(target_vcpu);
-		}
-		vgic_update_state(kvm);
-		return true;
-	}
-
-	return false;
-}
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm,
-				 struct kvm_exit_mmio *mmio,
-				 phys_addr_t offset, int vcpu_id)
-{
-	u32 *reg, orig;
-	u32 level_mask;
-	int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
-	level_mask = (~(*reg));
-
-	/* Mark both level and edge triggered irqs as pending */
-	reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-	orig = *reg;
-	vgic_reg_access(mmio, reg, offset, mode);
-
-	if (mmio->is_write) {
-		/* Set the soft-pending flag only for level-triggered irqs */
-		reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-					  vcpu_id, offset);
-		vgic_reg_access(mmio, reg, offset, mode);
-		*reg &= level_mask;
-
-		/* Ignore writes to SGIs */
-		if (offset < 2) {
-			*reg &= ~0xffff;
-			*reg |= orig & 0xffff;
-		}
-
-		vgic_update_state(kvm);
-		return true;
-	}
-
-	return false;
-}
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm,
-				   struct kvm_exit_mmio *mmio,
-				   phys_addr_t offset, int vcpu_id)
-{
-	u32 *level_active;
-	u32 *reg, orig;
-	int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-	orig = *reg;
-	vgic_reg_access(mmio, reg, offset, mode);
-	if (mmio->is_write) {
-		/* Re-set level triggered level-active interrupts */
-		level_active = vgic_bitmap_get_reg(&dist->irq_level,
-					  vcpu_id, offset);
-		reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
-		*reg |= *level_active;
-
-		/* Ignore writes to SGIs */
-		if (offset < 2) {
-			*reg &= ~0xffff;
-			*reg |= orig & 0xffff;
-		}
-
-		/* Clear soft-pending flags */
-		reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
-					  vcpu_id, offset);
-		vgic_reg_access(mmio, reg, offset, mode);
-
-		vgic_update_state(kvm);
-		return true;
-	}
-	return false;
-}
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-				struct kvm_exit_mmio *mmio,
-				phys_addr_t offset, int vcpu_id)
-{
-	u32 *reg;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
-
-	if (mmio->is_write) {
-		vgic_update_state(kvm);
-		return true;
-	}
-
-	return false;
-}
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-				  struct kvm_exit_mmio *mmio,
-				  phys_addr_t offset, int vcpu_id)
-{
-	u32 *reg;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
-	vgic_reg_access(mmio, reg, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
-
-	if (mmio->is_write) {
-		vgic_update_state(kvm);
-		return true;
-	}
-
-	return false;
-}
-
-static u32 vgic_cfg_expand(u16 val)
-{
-	u32 res = 0;
-	int i;
-
-	/*
-	 * Turn a 16bit value like abcd...mnop into a 32bit word
-	 * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
-	 */
-	for (i = 0; i < 16; i++)
-		res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
-
-	return res;
-}
-
-static u16 vgic_cfg_compress(u32 val)
-{
-	u16 res = 0;
-	int i;
-
-	/*
-	 * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like
-	 * abcd...mnop which is what we really care about.
-	 */
-	for (i = 0; i < 16; i++)
-		res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
-
-	return res;
-}
-
-/*
- * The distributor uses 2 bits per IRQ for the CFG register, but the
- * LSB is always 0. As such, we only keep the upper bit, and use the
- * two above functions to compress/expand the bits
- */
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-			 phys_addr_t offset)
-{
-	u32 val;
-
-	if (offset & 4)
-		val = *reg >> 16;
-	else
-		val = *reg & 0xffff;
-
-	val = vgic_cfg_expand(val);
-	vgic_reg_access(mmio, &val, offset,
-			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-	if (mmio->is_write) {
-		/* Ignore writes to read-only SGI and PPI bits */
-		if (offset < 8)
-			return false;
-
-		val = vgic_cfg_compress(val);
-		if (offset & 4) {
-			*reg &= 0xffff;
-			*reg |= val << 16;
-		} else {
-			*reg &= 0xffff << 16;
-			*reg |= val;
-		}
-	}
-
-	return false;
-}
-
-/**
- * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
- * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
- *
- * Move any IRQs that have already been assigned to LRs back to the
- * emulated distributor state so that the complete emulated state can be read
- * from the main emulation structures without investigating the LRs.
- */
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
-{
-	u64 elrsr = vgic_get_elrsr(vcpu);
-	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-	int i;
-
-	for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
-		struct vgic_lr lr = vgic_get_lr(vcpu, i);
-
-		/*
-		 * There are three options for the state bits:
-		 *
-		 * 01: pending
-		 * 10: active
-		 * 11: pending and active
-		 */
-		BUG_ON(!(lr.state & LR_STATE_MASK));
-
-		/* Reestablish SGI source for pending and active IRQs */
-		if (lr.irq < VGIC_NR_SGIS)
-			add_sgi_source(vcpu, lr.irq, lr.source);
-
-		/*
-		 * If the LR holds an active (10) or a pending and active (11)
-		 * interrupt then move the active state to the
-		 * distributor tracking bit.
-		 */
-		if (lr.state & LR_STATE_ACTIVE)
-			vgic_irq_set_active(vcpu, lr.irq);
-
-		/*
-		 * Reestablish the pending state on the distributor and the
-		 * CPU interface and mark the LR as free for other use.
-		 */
-		vgic_retire_lr(i, vcpu);
-
-		/* Finally update the VGIC state. */
-		vgic_update_state(vcpu->kvm);
-	}
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-				      int len, gpa_t offset)
-{
-	while (ranges->len) {
-		if (offset >= ranges->base &&
-		    (offset + len) <= (ranges->base + ranges->len))
-			return ranges;
-		ranges++;
-	}
-
-	return NULL;
-}
-
-static bool vgic_validate_access(const struct vgic_dist *dist,
-				 const struct vgic_io_range *range,
-				 unsigned long offset)
-{
-	int irq;
-
-	if (!range->bits_per_irq)
-		return true;	/* Not an irq-based access */
-
-	irq = offset * 8 / range->bits_per_irq;
-	if (irq >= dist->nr_irqs)
-		return false;
-
-	return true;
-}
-
-/*
- * Call the respective handler function for the given range.
- * We split up any 64 bit accesses into two consecutive 32 bit
- * handler calls and merge the result afterwards.
- * We do this in a little endian fashion regardless of the host's
- * or guest's endianness, because the GIC is always LE and the rest of
- * the code (vgic_reg_access) also puts it in a LE fashion already.
- * At this point we have already identified the handle function, so
- * range points to that one entry and offset is relative to this.
- */
-static bool call_range_handler(struct kvm_vcpu *vcpu,
-			       struct kvm_exit_mmio *mmio,
-			       unsigned long offset,
-			       const struct vgic_io_range *range)
-{
-	struct kvm_exit_mmio mmio32;
-	bool ret;
-
-	if (likely(mmio->len <= 4))
-		return range->handle_mmio(vcpu, mmio, offset);
-
-	/*
-	 * Any access bigger than 4 bytes (that we currently handle in KVM)
-	 * is actually 8 bytes long, caused by a 64-bit access
-	 */
-
-	mmio32.len = 4;
-	mmio32.is_write = mmio->is_write;
-	mmio32.private = mmio->private;
-
-	mmio32.phys_addr = mmio->phys_addr + 4;
-	mmio32.data = &((u32 *)mmio->data)[1];
-	ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-
-	mmio32.phys_addr = mmio->phys_addr;
-	mmio32.data = &((u32 *)mmio->data)[0];
-	ret |= range->handle_mmio(vcpu, &mmio32, offset);
-
-	return ret;
-}
-
-/**
- * vgic_handle_mmio_access - handle an in-kernel MMIO access
- * This is called by the read/write KVM IO device wrappers below.
- * @vcpu:	pointer to the vcpu performing the access
- * @this:	pointer to the KVM IO device in charge
- * @addr:	guest physical address of the access
- * @len:	size of the access
- * @val:	pointer to the data region
- * @is_write:	read or write access
- *
- * returns true if the MMIO access could be performed
- */
-static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
-				   struct kvm_io_device *this, gpa_t addr,
-				   int len, void *val, bool is_write)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct vgic_io_device *iodev = container_of(this,
-						    struct vgic_io_device, dev);
-	const struct vgic_io_range *range;
-	struct kvm_exit_mmio mmio;
-	bool updated_state;
-	gpa_t offset;
-
-	offset = addr - iodev->addr;
-	range = vgic_find_range(iodev->reg_ranges, len, offset);
-	if (unlikely(!range || !range->handle_mmio)) {
-		pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
-		return -ENXIO;
-	}
-
-	mmio.phys_addr = addr;
-	mmio.len = len;
-	mmio.is_write = is_write;
-	mmio.data = val;
-	mmio.private = iodev->redist_vcpu;
-
-	spin_lock(&dist->lock);
-	offset -= range->base;
-	if (vgic_validate_access(dist, range, offset)) {
-		updated_state = call_range_handler(vcpu, &mmio, offset, range);
-	} else {
-		if (!is_write)
-			memset(val, 0, len);
-		updated_state = false;
-	}
-	spin_unlock(&dist->lock);
-
-	if (updated_state)
-		vgic_kick_vcpus(vcpu->kvm);
-
-	return 0;
-}
-
-static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
-				 struct kvm_io_device *this,
-				 gpa_t addr, int len, void *val)
-{
-	return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
-}
-
-static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
-				  struct kvm_io_device *this,
-				  gpa_t addr, int len, const void *val)
-{
-	return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
-				       true);
-}
-
-static struct kvm_io_device_ops vgic_io_ops = {
-	.read	= vgic_handle_mmio_read,
-	.write	= vgic_handle_mmio_write,
-};
-
-/**
- * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
- * @kvm:            The VM structure pointer
- * @base:           The (guest) base address for the register frame
- * @len:            Length of the register frame window
- * @ranges:         Describing the handler functions for each register
- * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
- * @iodev:          Points to memory to be passed on to the handler
- *
- * @iodev stores the parameters of this function to be usable by the handler
- * respectively the dispatcher function (since the KVM I/O bus framework lacks
- * an opaque parameter). Initialization is done in this function, but the
- * reference should be valid and unique for the whole VGIC lifetime.
- * If the register frame is not mapped for a specific VCPU, pass -1 to
- * @redist_vcpu_id.
- */
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-			     const struct vgic_io_range *ranges,
-			     int redist_vcpu_id,
-			     struct vgic_io_device *iodev)
-{
-	struct kvm_vcpu *vcpu = NULL;
-	int ret;
-
-	if (redist_vcpu_id >= 0)
-		vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
-
-	iodev->addr		= base;
-	iodev->len		= len;
-	iodev->reg_ranges	= ranges;
-	iodev->redist_vcpu	= vcpu;
-
-	kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
-
-	mutex_lock(&kvm->slots_lock);
-
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
-				      &iodev->dev);
-	mutex_unlock(&kvm->slots_lock);
-
-	/* Mark the iodev as invalid if registration fails. */
-	if (ret)
-		iodev->dev.ops = NULL;
-
-	return ret;
-}
-
-static int vgic_nr_shared_irqs(struct vgic_dist *dist)
-{
-	return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-}
-
-static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	unsigned long *active, *enabled, *act_percpu, *act_shared;
-	unsigned long active_private, active_shared;
-	int nr_shared = vgic_nr_shared_irqs(dist);
-	int vcpu_id;
-
-	vcpu_id = vcpu->vcpu_id;
-	act_percpu = vcpu->arch.vgic_cpu.active_percpu;
-	act_shared = vcpu->arch.vgic_cpu.active_shared;
-
-	active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
-	enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-	bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
-
-	active = vgic_bitmap_get_shared_map(&dist->irq_active);
-	enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-	bitmap_and(act_shared, active, enabled, nr_shared);
-	bitmap_and(act_shared, act_shared,
-		   vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-		   nr_shared);
-
-	active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
-	active_shared = find_first_bit(act_shared, nr_shared);
-
-	return (active_private < VGIC_NR_PRIVATE_IRQS ||
-		active_shared < nr_shared);
-}
-
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
-	unsigned long pending_private, pending_shared;
-	int nr_shared = vgic_nr_shared_irqs(dist);
-	int vcpu_id;
-
-	vcpu_id = vcpu->vcpu_id;
-	pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
-	pend_shared = vcpu->arch.vgic_cpu.pending_shared;
-
-	if (!dist->enabled) {
-		bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-		bitmap_zero(pend_shared, nr_shared);
-		return 0;
-	}
-
-	pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
-	enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
-	bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
-
-	pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
-	enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-	bitmap_and(pend_shared, pending, enabled, nr_shared);
-	bitmap_and(pend_shared, pend_shared,
-		   vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-		   nr_shared);
-
-	pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-	pending_shared = find_first_bit(pend_shared, nr_shared);
-	return (pending_private < VGIC_NR_PRIVATE_IRQS ||
-		pending_shared < vgic_nr_shared_irqs(dist));
-}
-
-/*
- * Update the interrupt state and determine which CPUs have pending
- * or active interrupts. Must be called with distributor lock held.
- */
-void vgic_update_state(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int c;
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (compute_pending_for_cpu(vcpu))
-			set_bit(c, dist->irq_pending_on_cpu);
-
-		if (compute_active_for_cpu(vcpu))
-			set_bit(c, dist->irq_active_on_cpu);
-		else
-			clear_bit(c, dist->irq_active_on_cpu);
-	}
-}
-
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
-	return vgic_ops->get_lr(vcpu, lr);
-}
-
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
-			       struct vgic_lr vlr)
-{
-	vgic_ops->set_lr(vcpu, lr, vlr);
-}
-
-static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
-{
-	return vgic_ops->get_elrsr(vcpu);
-}
-
-static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
-{
-	return vgic_ops->get_eisr(vcpu);
-}
-
-static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
-{
-	vgic_ops->clear_eisr(vcpu);
-}
-
-static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
-{
-	return vgic_ops->get_interrupt_status(vcpu);
-}
-
-static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
-{
-	vgic_ops->enable_underflow(vcpu);
-}
-
-static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
-{
-	vgic_ops->disable_underflow(vcpu);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	vgic_ops->get_vmcr(vcpu, vmcr);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	vgic_ops->set_vmcr(vcpu, vmcr);
-}
-
-static inline void vgic_enable(struct kvm_vcpu *vcpu)
-{
-	vgic_ops->enable(vcpu);
-}
-
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
-{
-	struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
-
-	vgic_irq_clear_queued(vcpu, vlr.irq);
-
-	/*
-	 * We must transfer the pending state back to the distributor before
-	 * retiring the LR, otherwise we may loose edge-triggered interrupts.
-	 */
-	if (vlr.state & LR_STATE_PENDING) {
-		vgic_dist_irq_set_pending(vcpu, vlr.irq);
-		vlr.hwirq = 0;
-	}
-
-	vlr.state = 0;
-	vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-static bool dist_active_irq(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-	int i;
-
-	for (i = 0; i < vgic->nr_lr; i++) {
-		struct vgic_lr vlr = vgic_get_lr(vcpu, i);
-
-		if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
-			return true;
-	}
-
-	return vgic_irq_is_active(vcpu, virt_irq);
-}
-
-/*
- * An interrupt may have been disabled after being made pending on the
- * CPU interface (the classic case is a timer running while we're
- * rebooting the guest - the interrupt would kick as soon as the CPU
- * interface gets enabled, with deadly consequences).
- *
- * The solution is to examine already active LRs, and check the
- * interrupt is still enabled. If not, just retire it.
- */
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
-{
-	u64 elrsr = vgic_get_elrsr(vcpu);
-	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-	int lr;
-
-	for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-		struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-		if (!vgic_irq_is_enabled(vcpu, vlr.irq))
-			vgic_retire_lr(lr, vcpu);
-	}
-}
-
-static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
-				 int lr_nr, struct vgic_lr vlr)
-{
-	if (vgic_irq_is_active(vcpu, irq)) {
-		vlr.state |= LR_STATE_ACTIVE;
-		kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
-		vgic_irq_clear_active(vcpu, irq);
-		vgic_update_state(vcpu->kvm);
-	} else {
-		WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq));
-		vlr.state |= LR_STATE_PENDING;
-		kvm_debug("Set pending: 0x%x\n", vlr.state);
-	}
-
-	if (!vgic_irq_is_edge(vcpu, irq))
-		vlr.state |= LR_EOI_INT;
-
-	if (vlr.irq >= VGIC_NR_SGIS) {
-		struct irq_phys_map *map;
-		map = vgic_irq_map_search(vcpu, irq);
-
-		if (map) {
-			vlr.hwirq = map->phys_irq;
-			vlr.state |= LR_HW;
-			vlr.state &= ~LR_EOI_INT;
-
-			/*
-			 * Make sure we're not going to sample this
-			 * again, as a HW-backed interrupt cannot be
-			 * in the PENDING_ACTIVE stage.
-			 */
-			vgic_irq_set_queued(vcpu, irq);
-		}
-	}
-
-	vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/*
- * Queue an interrupt to a CPU virtual interface. Return true on success,
- * or false if it wasn't possible to queue it.
- * sgi_source must be zero for any non-SGI interrupts.
- */
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	u64 elrsr = vgic_get_elrsr(vcpu);
-	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
-	struct vgic_lr vlr;
-	int lr;
-
-	/* Sanitize the input... */
-	BUG_ON(sgi_source_id & ~7);
-	BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
-	BUG_ON(irq >= dist->nr_irqs);
-
-	kvm_debug("Queue IRQ%d\n", irq);
-
-	/* Do we have an active interrupt for the same CPUID? */
-	for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
-		vlr = vgic_get_lr(vcpu, lr);
-		if (vlr.irq == irq && vlr.source == sgi_source_id) {
-			kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
-			vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-			return true;
-		}
-	}
-
-	/* Try to use another LR for this interrupt */
-	lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
-	if (lr >= vgic->nr_lr)
-		return false;
-
-	kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-
-	vlr.irq = irq;
-	vlr.source = sgi_source_id;
-	vlr.state = 0;
-	vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-
-	return true;
-}
-
-static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
-{
-	if (!vgic_can_sample_irq(vcpu, irq))
-		return true; /* level interrupt, already queued */
-
-	if (vgic_queue_irq(vcpu, 0, irq)) {
-		if (vgic_irq_is_edge(vcpu, irq)) {
-			vgic_dist_irq_clear_pending(vcpu, irq);
-			vgic_cpu_irq_clear(vcpu, irq);
-		} else {
-			vgic_irq_set_queued(vcpu, irq);
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Fill the list registers with pending interrupts before running the
- * guest.
- */
-static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	unsigned long *pa_percpu, *pa_shared;
-	int i, vcpu_id;
-	int overflow = 0;
-	int nr_shared = vgic_nr_shared_irqs(dist);
-
-	vcpu_id = vcpu->vcpu_id;
-
-	pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
-	pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
-
-	bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
-		  VGIC_NR_PRIVATE_IRQS);
-	bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
-		  nr_shared);
-	/*
-	 * We may not have any pending interrupt, or the interrupts
-	 * may have been serviced from another vcpu. In all cases,
-	 * move along.
-	 */
-	if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu))
-		goto epilog;
-
-	/* SGIs */
-	for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
-		if (!queue_sgi(vcpu, i))
-			overflow = 1;
-	}
-
-	/* PPIs */
-	for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
-		if (!vgic_queue_hwirq(vcpu, i))
-			overflow = 1;
-	}
-
-	/* SPIs */
-	for_each_set_bit(i, pa_shared, nr_shared) {
-		if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
-			overflow = 1;
-	}
-
-
-
-
-epilog:
-	if (overflow) {
-		vgic_enable_underflow(vcpu);
-	} else {
-		vgic_disable_underflow(vcpu);
-		/*
-		 * We're about to run this VCPU, and we've consumed
-		 * everything the distributor had in store for
-		 * us. Claim we don't have anything pending. We'll
-		 * adjust that if needed while exiting.
-		 */
-		clear_bit(vcpu_id, dist->irq_pending_on_cpu);
-	}
-}
-
-static int process_queued_irq(struct kvm_vcpu *vcpu,
-				   int lr, struct vgic_lr vlr)
-{
-	int pending = 0;
-
-	/*
-	 * If the IRQ was EOIed (called from vgic_process_maintenance) or it
-	 * went from active to non-active (called from vgic_sync_hwirq) it was
-	 * also ACKed and we we therefore assume we can clear the soft pending
-	 * state (should it had been set) for this interrupt.
-	 *
-	 * Note: if the IRQ soft pending state was set after the IRQ was
-	 * acked, it actually shouldn't be cleared, but we have no way of
-	 * knowing that unless we start trapping ACKs when the soft-pending
-	 * state is set.
-	 */
-	vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
-
-	/*
-	 * Tell the gic to start sampling this interrupt again.
-	 */
-	vgic_irq_clear_queued(vcpu, vlr.irq);
-
-	/* Any additional pending interrupt? */
-	if (vgic_irq_is_edge(vcpu, vlr.irq)) {
-		BUG_ON(!(vlr.state & LR_HW));
-		pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
-	} else {
-		if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
-			vgic_cpu_irq_set(vcpu, vlr.irq);
-			pending = 1;
-		} else {
-			vgic_dist_irq_clear_pending(vcpu, vlr.irq);
-			vgic_cpu_irq_clear(vcpu, vlr.irq);
-		}
-	}
-
-	/*
-	 * Despite being EOIed, the LR may not have
-	 * been marked as empty.
-	 */
-	vlr.state = 0;
-	vlr.hwirq = 0;
-	vgic_set_lr(vcpu, lr, vlr);
-
-	return pending;
-}
-
-static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
-{
-	u32 status = vgic_get_interrupt_status(vcpu);
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct kvm *kvm = vcpu->kvm;
-	int level_pending = 0;
-
-	kvm_debug("STATUS = %08x\n", status);
-
-	if (status & INT_STATUS_EOI) {
-		/*
-		 * Some level interrupts have been EOIed. Clear their
-		 * active bit.
-		 */
-		u64 eisr = vgic_get_eisr(vcpu);
-		unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
-		int lr;
-
-		for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
-			struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-			WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
-			WARN_ON(vlr.state & LR_STATE_MASK);
-
-
-			/*
-			 * kvm_notify_acked_irq calls kvm_set_irq()
-			 * to reset the IRQ level, which grabs the dist->lock
-			 * so we call this before taking the dist->lock.
-			 */
-			kvm_notify_acked_irq(kvm, 0,
-					     vlr.irq - VGIC_NR_PRIVATE_IRQS);
-
-			spin_lock(&dist->lock);
-			level_pending |= process_queued_irq(vcpu, lr, vlr);
-			spin_unlock(&dist->lock);
-		}
-	}
-
-	if (status & INT_STATUS_UNDERFLOW)
-		vgic_disable_underflow(vcpu);
-
-	/*
-	 * In the next iterations of the vcpu loop, if we sync the vgic state
-	 * after flushing it, but before entering the guest (this happens for
-	 * pending signals and vmid rollovers), then make sure we don't pick
-	 * up any old maintenance interrupts here.
-	 */
-	vgic_clear_eisr(vcpu);
-
-	return level_pending;
-}
-
-/*
- * Save the physical active state, and reset it to inactive.
- *
- * Return true if there's a pending forwarded interrupt to queue.
- */
-static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	bool level_pending;
-
-	if (!(vlr.state & LR_HW))
-		return false;
-
-	if (vlr.state & LR_STATE_ACTIVE)
-		return false;
-
-	spin_lock(&dist->lock);
-	level_pending = process_queued_irq(vcpu, lr, vlr);
-	spin_unlock(&dist->lock);
-	return level_pending;
-}
-
-/* Sync back the VGIC state after a guest run */
-static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	u64 elrsr;
-	unsigned long *elrsr_ptr;
-	int lr, pending;
-	bool level_pending;
-
-	level_pending = vgic_process_maintenance(vcpu);
-
-	/* Deal with HW interrupts, and clear mappings for empty LRs */
-	for (lr = 0; lr < vgic->nr_lr; lr++) {
-		struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
-		level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
-		BUG_ON(vlr.irq >= dist->nr_irqs);
-	}
-
-	/* Check if we still have something up our sleeve... */
-	elrsr = vgic_get_elrsr(vcpu);
-	elrsr_ptr = u64_to_bitmask(&elrsr);
-	pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
-	if (level_pending || pending < vgic->nr_lr)
-		set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
-	spin_lock(&dist->lock);
-	__kvm_vgic_flush_hwstate(vcpu);
-	spin_unlock(&dist->lock);
-}
-
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
-	__kvm_vgic_sync_hwstate(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return 0;
-
-	return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int c;
-
-	/*
-	 * We've injected an interrupt, time to find out who deserves
-	 * a good kick...
-	 */
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (kvm_vgic_vcpu_pending_irq(vcpu))
-			kvm_vcpu_kick(vcpu);
-	}
-}
-
-static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
-{
-	int edge_triggered = vgic_irq_is_edge(vcpu, irq);
-
-	/*
-	 * Only inject an interrupt if:
-	 * - edge triggered and we have a rising edge
-	 * - level triggered and we change level
-	 */
-	if (edge_triggered) {
-		int state = vgic_dist_irq_is_pending(vcpu, irq);
-		return level > state;
-	} else {
-		int state = vgic_dist_irq_get_level(vcpu, irq);
-		return level != state;
-	}
-}
-
-static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-				   unsigned int irq_num, bool level)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int edge_triggered, level_triggered;
-	int enabled;
-	bool ret = true, can_inject = true;
-
-	trace_vgic_update_irq_pending(cpuid, irq_num, level);
-
-	if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
-		return -EINVAL;
-
-	spin_lock(&dist->lock);
-
-	vcpu = kvm_get_vcpu(kvm, cpuid);
-	edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
-	level_triggered = !edge_triggered;
-
-	if (!vgic_validate_injection(vcpu, irq_num, level)) {
-		ret = false;
-		goto out;
-	}
-
-	if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
-		cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
-		if (cpuid == VCPU_NOT_ALLOCATED) {
-			/* Pretend we use CPU0, and prevent injection */
-			cpuid = 0;
-			can_inject = false;
-		}
-		vcpu = kvm_get_vcpu(kvm, cpuid);
-	}
-
-	kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
-
-	if (level) {
-		if (level_triggered)
-			vgic_dist_irq_set_level(vcpu, irq_num);
-		vgic_dist_irq_set_pending(vcpu, irq_num);
-	} else {
-		if (level_triggered) {
-			vgic_dist_irq_clear_level(vcpu, irq_num);
-			if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) {
-				vgic_dist_irq_clear_pending(vcpu, irq_num);
-				vgic_cpu_irq_clear(vcpu, irq_num);
-				if (!compute_pending_for_cpu(vcpu))
-					clear_bit(cpuid, dist->irq_pending_on_cpu);
-			}
-		}
-
-		ret = false;
-		goto out;
-	}
-
-	enabled = vgic_irq_is_enabled(vcpu, irq_num);
-
-	if (!enabled || !can_inject) {
-		ret = false;
-		goto out;
-	}
-
-	if (!vgic_can_sample_irq(vcpu, irq_num)) {
-		/*
-		 * Level interrupt in progress, will be picked up
-		 * when EOId.
-		 */
-		ret = false;
-		goto out;
-	}
-
-	if (level) {
-		vgic_cpu_irq_set(vcpu, irq_num);
-		set_bit(cpuid, dist->irq_pending_on_cpu);
-	}
-
-out:
-	spin_unlock(&dist->lock);
-
-	if (ret) {
-		/* kick the specified vcpu */
-		kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
-	}
-
-	return 0;
-}
-
-static int vgic_lazy_init(struct kvm *kvm)
-{
-	int ret = 0;
-
-	if (unlikely(!vgic_initialized(kvm))) {
-		/*
-		 * We only provide the automatic initialization of the VGIC
-		 * for the legacy case of a GICv2. Any other type must
-		 * be explicitly initialized once setup with the respective
-		 * KVM device call.
-		 */
-		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-			return -EBUSY;
-
-		mutex_lock(&kvm->lock);
-		ret = vgic_init(kvm);
-		mutex_unlock(&kvm->lock);
-	}
-
-	return ret;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device. This IRQ
- *           must not be mapped to a HW interrupt.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *			      false: to ignore the call
- *	     Level-sensitive  true:  raise the input signal
- *			      false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
-			bool level)
-{
-	struct irq_phys_map *map;
-	int ret;
-
-	ret = vgic_lazy_init(kvm);
-	if (ret)
-		return ret;
-
-	map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
-	if (map)
-		return -EINVAL;
-
-	return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-}
-
-/**
- * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @virt_irq: The virtual IRQ to be injected
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *			      false: to ignore the call
- *	     Level-sensitive  true:  raise the input signal
- *			      false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-			       unsigned int virt_irq, bool level)
-{
-	int ret;
-
-	ret = vgic_lazy_init(kvm);
-	if (ret)
-		return ret;
-
-	return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-	/*
-	 * We cannot rely on the vgic maintenance interrupt to be
-	 * delivered synchronously. This means we can only use it to
-	 * exit the VM, and we perform the handling of EOIed
-	 * interrupts on the exit path (see vgic_process_maintenance).
-	 */
-	return IRQ_HANDLED;
-}
-
-static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
-						    int virt_irq)
-{
-	if (virt_irq < VGIC_NR_PRIVATE_IRQS)
-		return &vcpu->arch.vgic_cpu.irq_phys_map_list;
-	else
-		return &vcpu->kvm->arch.vgic.irq_phys_map_list;
-}
-
-/**
- * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number for the guest
- * @phys_irq: The hardware IRQ number of the host
- *
- * Establish a mapping between a guest visible irq (@virt_irq) and a
- * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @phys_irq. This mapping can be
- * established multiple times as long as the parameters are the same.
- *
- * Returns 0 on success or an error value otherwise.
- */
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-	struct irq_phys_map *map;
-	struct irq_phys_map_entry *entry;
-	int ret = 0;
-
-	/* Create a new mapping */
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	spin_lock(&dist->irq_phys_map_lock);
-
-	/* Try to match an existing mapping */
-	map = vgic_irq_map_search(vcpu, virt_irq);
-	if (map) {
-		/* Make sure this mapping matches */
-		if (map->phys_irq != phys_irq)
-			ret = -EINVAL;
-
-		/* Found an existing, valid mapping */
-		goto out;
-	}
-
-	map           = &entry->map;
-	map->virt_irq = virt_irq;
-	map->phys_irq = phys_irq;
-
-	list_add_tail_rcu(&entry->entry, root);
-
-out:
-	spin_unlock(&dist->irq_phys_map_lock);
-	/* If we've found a hit in the existing list, free the useless
-	 * entry */
-	if (ret || map != &entry->map)
-		kfree(entry);
-	return ret;
-}
-
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
-						int virt_irq)
-{
-	struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-	struct irq_phys_map_entry *entry;
-	struct irq_phys_map *map;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(entry, root, entry) {
-		map = &entry->map;
-		if (map->virt_irq == virt_irq) {
-			rcu_read_unlock();
-			return map;
-		}
-	}
-
-	rcu_read_unlock();
-
-	return NULL;
-}
-
-static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
-{
-	struct irq_phys_map_entry *entry;
-
-	entry = container_of(rcu, struct irq_phys_map_entry, rcu);
-	kfree(entry);
-}
-
-/**
- * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number to be unmapped
- *
- * Remove an existing mapping between virtual and physical interrupts.
- */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct irq_phys_map_entry *entry;
-	struct list_head *root;
-
-	root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-
-	spin_lock(&dist->irq_phys_map_lock);
-
-	list_for_each_entry(entry, root, entry) {
-		if (entry->map.virt_irq == virt_irq) {
-			list_del_rcu(&entry->entry);
-			call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-			break;
-		}
-	}
-
-	spin_unlock(&dist->irq_phys_map_lock);
-
-	return 0;
-}
-
-static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct irq_phys_map_entry *entry;
-
-	spin_lock(&dist->irq_phys_map_lock);
-
-	list_for_each_entry(entry, root, entry) {
-		list_del_rcu(&entry->entry);
-		call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
-	}
-
-	spin_unlock(&dist->irq_phys_map_lock);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	kfree(vgic_cpu->pending_shared);
-	kfree(vgic_cpu->active_shared);
-	kfree(vgic_cpu->pend_act_shared);
-	vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
-	vgic_cpu->pending_shared = NULL;
-	vgic_cpu->active_shared = NULL;
-	vgic_cpu->pend_act_shared = NULL;
-}
-
-static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-	int sz = nr_longs * sizeof(unsigned long);
-	vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
-	vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
-	vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-
-	if (!vgic_cpu->pending_shared
-		|| !vgic_cpu->active_shared
-		|| !vgic_cpu->pend_act_shared) {
-		kvm_vgic_vcpu_destroy(vcpu);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/**
- * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
-}
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-int kvm_vgic_get_max_vcpus(void)
-{
-	return vgic->max_gic_vcpus;
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_destroy(vcpu);
-
-	vgic_free_bitmap(&dist->irq_enabled);
-	vgic_free_bitmap(&dist->irq_level);
-	vgic_free_bitmap(&dist->irq_pending);
-	vgic_free_bitmap(&dist->irq_soft_pend);
-	vgic_free_bitmap(&dist->irq_queued);
-	vgic_free_bitmap(&dist->irq_cfg);
-	vgic_free_bytemap(&dist->irq_priority);
-	if (dist->irq_spi_target) {
-		for (i = 0; i < dist->nr_cpus; i++)
-			vgic_free_bitmap(&dist->irq_spi_target[i]);
-	}
-	kfree(dist->irq_sgi_sources);
-	kfree(dist->irq_spi_cpu);
-	kfree(dist->irq_spi_mpidr);
-	kfree(dist->irq_spi_target);
-	kfree(dist->irq_pending_on_cpu);
-	kfree(dist->irq_active_on_cpu);
-	vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
-	dist->irq_sgi_sources = NULL;
-	dist->irq_spi_cpu = NULL;
-	dist->irq_spi_target = NULL;
-	dist->irq_pending_on_cpu = NULL;
-	dist->irq_active_on_cpu = NULL;
-	dist->nr_cpus = 0;
-}
-
-/*
- * Allocate and initialize the various data structures. Must be called
- * with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int nr_cpus, nr_irqs;
-	int ret, i, vcpu_id;
-
-	if (vgic_initialized(kvm))
-		return 0;
-
-	nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
-	if (!nr_cpus)		/* No vcpus? Can't be good... */
-		return -ENODEV;
-
-	/*
-	 * If nobody configured the number of interrupts, use the
-	 * legacy one.
-	 */
-	if (!dist->nr_irqs)
-		dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
-
-	nr_irqs = dist->nr_irqs;
-
-	ret  = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
-	ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
-	ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
-
-	if (ret)
-		goto out;
-
-	dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
-	dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
-	dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
-				       GFP_KERNEL);
-	dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-					   GFP_KERNEL);
-	dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
-					   GFP_KERNEL);
-	if (!dist->irq_sgi_sources ||
-	    !dist->irq_spi_cpu ||
-	    !dist->irq_spi_target ||
-	    !dist->irq_pending_on_cpu ||
-	    !dist->irq_active_on_cpu) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	for (i = 0; i < nr_cpus; i++)
-		ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
-					nr_cpus, nr_irqs);
-
-	if (ret)
-		goto out;
-
-	ret = kvm->arch.vgic.vm_ops.init_model(kvm);
-	if (ret)
-		goto out;
-
-	kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
-		ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
-		if (ret) {
-			kvm_err("VGIC: Failed to allocate vcpu memory\n");
-			break;
-		}
-
-		/*
-		 * Enable and configure all SGIs to be edge-triggere and
-		 * configure all PPIs as level-triggered.
-		 */
-		for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-			if (i < VGIC_NR_SGIS) {
-				/* SGIs */
-				vgic_bitmap_set_irq_val(&dist->irq_enabled,
-							vcpu->vcpu_id, i, 1);
-				vgic_bitmap_set_irq_val(&dist->irq_cfg,
-							vcpu->vcpu_id, i,
-							VGIC_CFG_EDGE);
-			} else if (i < VGIC_NR_PRIVATE_IRQS) {
-				/* PPIs */
-				vgic_bitmap_set_irq_val(&dist->irq_cfg,
-							vcpu->vcpu_id, i,
-							VGIC_CFG_LEVEL);
-			}
-		}
-
-		vgic_enable(vcpu);
-	}
-
-out:
-	if (ret)
-		kvm_vgic_destroy(kvm);
-
-	return ret;
-}
-
-static int init_vgic_model(struct kvm *kvm, int type)
-{
-	switch (type) {
-	case KVM_DEV_TYPE_ARM_VGIC_V2:
-		vgic_v2_init_emulation(kvm);
-		break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-	case KVM_DEV_TYPE_ARM_VGIC_V3:
-		vgic_v3_init_emulation(kvm);
-		break;
-#endif
-	default:
-		return -ENODEV;
-	}
-
-	if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
-		return -E2BIG;
-
-	return 0;
-}
-
-/**
- * kvm_vgic_early_init - Earliest possible vgic initialization stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-	spin_lock_init(&kvm->arch.vgic.lock);
-	spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
-	INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
-}
-
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-	int i, vcpu_lock_idx = -1, ret;
-	struct kvm_vcpu *vcpu;
-
-	mutex_lock(&kvm->lock);
-
-	if (irqchip_in_kernel(kvm)) {
-		ret = -EEXIST;
-		goto out;
-	}
-
-	/*
-	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
-	 * which had no chance yet to check the availability of the GICv2
-	 * emulation. So check this here again. KVM_CREATE_DEVICE does
-	 * the proper checks already.
-	 */
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	/*
-	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
-	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-	 * that no other VCPUs are run while we create the vgic.
-	 */
-	ret = -EBUSY;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (!mutex_trylock(&vcpu->mutex))
-			goto out_unlock;
-		vcpu_lock_idx = i;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu->arch.has_run_once)
-			goto out_unlock;
-	}
-	ret = 0;
-
-	ret = init_vgic_model(kvm, type);
-	if (ret)
-		goto out_unlock;
-
-	kvm->arch.vgic.in_kernel = true;
-	kvm->arch.vgic.vgic_model = type;
-	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
-	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-	kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-	kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
-
-out_unlock:
-	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-		vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-		mutex_unlock(&vcpu->mutex);
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-static int vgic_ioaddr_overlap(struct kvm *kvm)
-{
-	phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
-	phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
-
-	if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
-		return 0;
-	if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
-	    (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
-		return -EBUSY;
-	return 0;
-}
-
-static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
-			      phys_addr_t addr, phys_addr_t size)
-{
-	int ret;
-
-	if (addr & ~KVM_PHYS_MASK)
-		return -E2BIG;
-
-	if (addr & (SZ_4K - 1))
-		return -EINVAL;
-
-	if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-		return -EEXIST;
-	if (addr + size < addr)
-		return -EINVAL;
-
-	*ioaddr = addr;
-	ret = vgic_ioaddr_overlap(kvm);
-	if (ret)
-		*ioaddr = VGIC_ADDR_UNDEF;
-
-	return ret;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-	int r = 0;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	int type_needed;
-	phys_addr_t *addr_ptr, block_size;
-	phys_addr_t alignment;
-
-	mutex_lock(&kvm->lock);
-	switch (type) {
-	case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-		addr_ptr = &vgic->vgic_dist_base;
-		block_size = KVM_VGIC_V2_DIST_SIZE;
-		alignment = SZ_4K;
-		break;
-	case KVM_VGIC_V2_ADDR_TYPE_CPU:
-		type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
-		addr_ptr = &vgic->vgic_cpu_base;
-		block_size = KVM_VGIC_V2_CPU_SIZE;
-		alignment = SZ_4K;
-		break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-	case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-		addr_ptr = &vgic->vgic_dist_base;
-		block_size = KVM_VGIC_V3_DIST_SIZE;
-		alignment = SZ_64K;
-		break;
-	case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-		type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
-		addr_ptr = &vgic->vgic_redist_base;
-		block_size = KVM_VGIC_V3_REDIST_SIZE;
-		alignment = SZ_64K;
-		break;
-#endif
-	default:
-		r = -ENODEV;
-		goto out;
-	}
-
-	if (vgic->vgic_model != type_needed) {
-		r = -ENODEV;
-		goto out;
-	}
-
-	if (write) {
-		if (!IS_ALIGNED(*addr, alignment))
-			r = -EINVAL;
-		else
-			r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
-					       block_size);
-	} else {
-		*addr = *addr_ptr;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	int r;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (copy_from_user(&addr, uaddr, sizeof(addr)))
-			return -EFAULT;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-		return (r == -ENODEV) ? -ENXIO : r;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 val;
-		int ret = 0;
-
-		if (get_user(val, uaddr))
-			return -EFAULT;
-
-		/*
-		 * We require:
-		 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-		 * - at most 1024 interrupts
-		 * - a multiple of 32 interrupts
-		 */
-		if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-		    val > VGIC_MAX_IRQS ||
-		    (val & 31))
-			return -EINVAL;
-
-		mutex_lock(&dev->kvm->lock);
-
-		if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
-			ret = -EBUSY;
-		else
-			dev->kvm->arch.vgic.nr_irqs = val;
-
-		mutex_unlock(&dev->kvm->lock);
-
-		return ret;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			r = vgic_init(dev->kvm);
-			return r;
-		}
-		break;
-	}
-	}
-
-	return -ENXIO;
-}
-
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	int r = -ENXIO;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-		if (r)
-			return (r == -ENODEV) ? -ENXIO : r;
-
-		if (copy_to_user(uaddr, &addr, sizeof(addr)))
-			return -EFAULT;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-		r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
-		break;
-	}
-
-	}
-
-	return r;
-}
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
-{
-	if (vgic_find_range(ranges, 4, offset))
-		return 0;
-	else
-		return -ENXIO;
-}
-
-static void vgic_init_maintenance_interrupt(void *info)
-{
-	enable_percpu_irq(vgic->maint_irq, 0);
-}
-
-static int vgic_cpu_notify(struct notifier_block *self,
-			   unsigned long action, void *cpu)
-{
-	switch (action) {
-	case CPU_STARTING:
-	case CPU_STARTING_FROZEN:
-		vgic_init_maintenance_interrupt(NULL);
-		break;
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		disable_percpu_irq(vgic->maint_irq);
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block vgic_cpu_nb = {
-	.notifier_call = vgic_cpu_notify,
-};
-
-static int kvm_vgic_probe(void)
-{
-	const struct gic_kvm_info *gic_kvm_info;
-	int ret;
-
-	gic_kvm_info = gic_get_kvm_info();
-	if (!gic_kvm_info)
-		return -ENODEV;
-
-	switch (gic_kvm_info->type) {
-	case GIC_V2:
-		ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
-		break;
-	case GIC_V3:
-		ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
-		break;
-	default:
-		ret = -ENODEV;
-	}
-
-	return ret;
-}
-
-int kvm_vgic_hyp_init(void)
-{
-	int ret;
-
-	ret = kvm_vgic_probe();
-	if (ret) {
-		kvm_err("error: KVM vGIC probing failed\n");
-		return ret;
-	}
-
-	ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
-				 "vgic", kvm_get_running_vcpus());
-	if (ret) {
-		kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
-		return ret;
-	}
-
-	ret = __register_cpu_notifier(&vgic_cpu_nb);
-	if (ret) {
-		kvm_err("Cannot register vgic CPU notifier\n");
-		goto out_free_irq;
-	}
-
-	on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
-
-	return 0;
-
-out_free_irq:
-	free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
-	return ret;
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
-		    struct kvm_kernel_irq_routing_entry *entries,
-		    int gsi)
-{
-	return 0;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-	return pin;
-}
-
-int kvm_set_irq(struct kvm *kvm, int irq_source_id,
-		u32 irq, int level, bool line_status)
-{
-	unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
-
-	trace_kvm_set_irq(irq, level, irq_source_id);
-
-	BUG_ON(!vgic_initialized(kvm));
-
-	return kvm_vgic_inject_irq(kvm, 0, spi, level);
-}
-
-/* MSI not implemented yet */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		struct kvm *kvm, int irq_source_id,
-		int level, bool line_status)
-{
-	return 0;
-}
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
deleted file mode 100644
index 0df74cbb620068..00000000000000
--- a/virt/kvm/arm/vgic.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2014 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from virt/kvm/arm/vgic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __KVM_VGIC_H__
-#define __KVM_VGIC_H__
-
-#include <kvm/iodev.h>
-
-#define VGIC_ADDR_UNDEF		(-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
-#define IMPLEMENTER_ARM		0x43b
-
-#define ACCESS_READ_VALUE	(1 << 0)
-#define ACCESS_READ_RAZ		(0 << 0)
-#define ACCESS_READ_MASK(x)	((x) & (1 << 0))
-#define ACCESS_WRITE_IGNORED	(0 << 1)
-#define ACCESS_WRITE_SETBIT	(1 << 1)
-#define ACCESS_WRITE_CLEARBIT	(2 << 1)
-#define ACCESS_WRITE_VALUE	(3 << 1)
-#define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))
-
-#define VCPU_NOT_ALLOCATED	((u8)-1)
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
-
-void vgic_update_state(struct kvm *kvm);
-int vgic_init_common_maps(struct kvm *kvm);
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
-			     int irq, int val);
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
-
-struct kvm_exit_mmio {
-	phys_addr_t	phys_addr;
-	void		*data;
-	u32		len;
-	bool		is_write;
-	void		*private;
-};
-
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
-		     phys_addr_t offset, int mode);
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-			phys_addr_t offset);
-
-static inline
-u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
-{
-	return le32_to_cpu(*((u32 *)mmio->data)) & mask;
-}
-
-static inline
-void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
-{
-	*((u32 *)mmio->data) = cpu_to_le32(value) & mask;
-}
-
-struct vgic_io_range {
-	phys_addr_t base;
-	unsigned long len;
-	int bits_per_irq;
-	bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
-			    phys_addr_t offset);
-};
-
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
-			     const struct vgic_io_range *ranges,
-			     int redist_id,
-			     struct vgic_io_device *iodev);
-
-static inline bool is_in_range(phys_addr_t addr, unsigned long len,
-			       phys_addr_t baseaddr, unsigned long size)
-{
-	return (addr >= baseaddr) && (addr + len <= baseaddr + size);
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
-				      int len, gpa_t offset);
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-			    phys_addr_t offset, int vcpu_id, int access);
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-				 phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
-				   phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
-				struct kvm_exit_mmio *mmio,
-				phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
-				  struct kvm_exit_mmio *mmio,
-				  phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
-			 phys_addr_t offset);
-
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-
-int vgic_init(struct kvm *kvm);
-void vgic_v2_init_emulation(struct kvm *kvm);
-void vgic_v3_init_emulation(struct kvm *kvm);
-
-#endif

From 82a81bff90c5fd11fefae35773f7396617a3cfff Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:34 +0100
Subject: [PATCH 185/302] arm64: KVM: Merged page tables documentation

Since dealing with VA ranges tends to hurt my brain badly, let's
start with a bit of documentation that will hopefully help
in understanding what comes next...

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_mmu.h | 40 +++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index fdfbddbe9fbac5..6149dfc2c01224 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -29,10 +29,44 @@
  *
  * Instead, give the HYP mode its own VA region at a fixed offset from
  * the kernel by just masking the top bits (which are all ones for a
- * kernel address).
+ * kernel address). We need to find out how many bits to mask.
  *
- * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
- * macros (the entire kernel runs at EL2).
+ * We want to build a set of page tables that cover both parts of the
+ * idmap (the trampoline page used to initialize EL2), and our normal
+ * runtime VA space, at the same time.
+ *
+ * Given that the kernel uses VA_BITS for its entire address space,
+ * and that half of that space (VA_BITS - 1) is used for the linear
+ * mapping, we can also limit the EL2 space to (VA_BITS - 1).
+ *
+ * The main question is "Within the VA_BITS space, does EL2 use the
+ * top or the bottom half of that space to shadow the kernel's linear
+ * mapping?". As we need to idmap the trampoline page, this is
+ * determined by the range in which this page lives.
+ *
+ * If the page is in the bottom half, we have to use the top half. If
+ * the page is in the top half, we have to use the bottom half:
+ *
+ * T = __virt_to_phys(__hyp_idmap_text_start)
+ * if (T & BIT(VA_BITS - 1))
+ *	HYP_VA_MIN = 0  //idmap in upper half
+ * else
+ *	HYP_VA_MIN = 1 << (VA_BITS - 1)
+ * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
+ *
+ * This of course assumes that the trampoline page exists within the
+ * VA_BITS range. If it doesn't, then it means we're in the odd case
+ * where the kernel idmap (as well as HYP) uses more levels than the
+ * kernel runtime page tables (as seen when the kernel is configured
+ * for 4k pages, 39bits VA, and yet memory lives just above that
+ * limit, forcing the idmap to use 4 levels of page tables while the
+ * kernel itself only uses 3). In this particular case, it doesn't
+ * matter which side of VA_BITS we use, as we're guaranteed not to
+ * conflict with anything.
+ *
+ * When using VHE, there are no separate hyp mappings and all KVM
+ * functionality is already mapped as part of the main kernel
+ * mappings, and none of this applies in that case.
  */
 #define HYP_PAGE_OFFSET_SHIFT	VA_BITS
 #define HYP_PAGE_OFFSET_MASK	((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)

From cf7df13d3c7c7f8a475c09ef49a5b72f7cfe3f4b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:35 +0100
Subject: [PATCH 186/302] arm64: KVM: Always reference __hyp_panic_string via
 its kernel VA

__hyp_panic_string is passed via the HYP panic code to the panic
function, and is being "upgraded" to a kernel address, as it is
referenced by the HYP code (in a PC-relative way).

This is a bit silly, and we'd be better off obtaining the kernel
address and not messing with it at all. This patch implements this
with a tiny bit of asm glue, by forcing the string pointer to be
read from the literal pool.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kvm/hyp/switch.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 437cfad5e3d868..81f21a2ab968eb 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%
 
 static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
 {
-	unsigned long str_va = (unsigned long)__hyp_panic_string;
+	unsigned long str_va;
 
-	__hyp_do_panic(hyp_kern_va(str_va),
+	/*
+	 * Force the panic string to be loaded from the literal pool,
+	 * making sure it is a kernel address and not a PC-relative
+	 * reference.
+	 */
+	asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+
+	__hyp_do_panic(str_va,
 		       spsr,  elr,
 		       read_sysreg(esr_el2),   read_sysreg_el2(far),
 		       read_sysreg(hpfar_el2), par,

From 3f0f8830d440e3edf5580424519a7c3434891c64 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:36 +0100
Subject: [PATCH 187/302] arm/arm64: KVM: Remove hyp_kern_va helper

hyp_kern_va is now completely unused, so let's remove it entirely.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_hyp.h   |  1 -
 arch/arm64/include/asm/kvm_hyp.h | 12 ------------
 2 files changed, 13 deletions(-)

diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index f0e86076138077..e38fce2270ac9c 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -26,7 +26,6 @@
 #define __hyp_text __section(.hyp.text) notrace
 
 #define kern_hyp_va(v) (v)
-#define hyp_kern_va(v) (v)
 
 #define __ACCESS_CP15(CRn, Op1, CRm, Op2)	\
 	"mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 44eaff70da6ae0..1d81f9abd172c1 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -36,18 +36,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
 
 #define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
 
-static inline unsigned long __hyp_kern_va(unsigned long v)
-{
-	u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
-	asm volatile(ALTERNATIVE("add %0, %0, %1",
-				 "nop",
-				 ARM64_HAS_VIRT_HOST_EXTN)
-		     : "+r" (v) : "r" (offset));
-	return v;
-}
-
-#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
-
 #define read_sysreg_elx(r,nvh,vh)					\
 	({								\
 		u64 reg;						\

From fd16fe6820ede711c6e6950ffebdbc8ade5d05b3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:37 +0100
Subject: [PATCH 188/302] arm64: KVM: Kill HYP_PAGE_OFFSET

HYP_PAGE_OFFSET is not massively useful. And the way we use it
in KERN_TO_HYP is inconsistent with the equivalent operation in
EL2, where we use a mask instead.

Let's replace the uses of HYP_PAGE_OFFSET with HYP_PAGE_OFFSET_MASK,
and get rid of the pointless macro.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_mmu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 6149dfc2c01224..2f1e1aec5ecb38 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -70,7 +70,6 @@
  */
 #define HYP_PAGE_OFFSET_SHIFT	VA_BITS
 #define HYP_PAGE_OFFSET_MASK	((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
-#define HYP_PAGE_OFFSET		(PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
 
 /*
  * Our virtual mapping for the idmap-ed MMU-enable code. Must be
@@ -104,7 +103,7 @@ alternative_endif
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 
-#define KERN_TO_HYP(kva)	((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET)
+#define KERN_TO_HYP(kva)	((unsigned long)kva & HYP_PAGE_OFFSET_MASK)
 
 /*
  * We currently only support a 40bit IPA.

From 853c3b21ff35816a2ae351fd7c2adb101c1f4503 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:38 +0100
Subject: [PATCH 189/302] arm64: Add ARM64_HYP_OFFSET_LOW capability

As we need to indicate to the rest of the kernel which region of
the HYP VA space is safe to use, add a capability that will
indicate that KVM should use the [VA_BITS-2:0] range.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/cpufeature.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 224efe730e4614..d40edbb6ef2300 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -36,8 +36,9 @@
 #define ARM64_HAS_VIRT_HOST_EXTN		11
 #define ARM64_WORKAROUND_CAVIUM_27456		12
 #define ARM64_HAS_32BIT_EL0			13
+#define ARM64_HYP_OFFSET_LOW			14
 
-#define ARM64_NCAPS				14
+#define ARM64_NCAPS				15
 
 #ifndef __ASSEMBLY__
 

From d53d9bc65289dc50b42587313466594e4d611f0f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:39 +0100
Subject: [PATCH 190/302] arm64: KVM: Define HYP offset masks

Define the two possible HYP VA regions in terms of VA_BITS,
and keep HYP_PAGE_OFFSET_MASK as a temporary compatibility
definition.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_mmu.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 2f1e1aec5ecb38..5e543231f61526 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -68,8 +68,12 @@
  * functionality is already mapped as part of the main kernel
  * mappings, and none of this applies in that case.
  */
-#define HYP_PAGE_OFFSET_SHIFT	VA_BITS
-#define HYP_PAGE_OFFSET_MASK	((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
+
+#define HYP_PAGE_OFFSET_HIGH_MASK	((UL(1) << VA_BITS) - 1)
+#define HYP_PAGE_OFFSET_LOW_MASK	((UL(1) << (VA_BITS - 1)) - 1)
+
+/* Temporary compat define */
+#define HYP_PAGE_OFFSET_MASK		HYP_PAGE_OFFSET_HIGH_MASK
 
 /*
  * Our virtual mapping for the idmap-ed MMU-enable code. Must be

From fd81e6bf3928c14f90a033df164c375d4ce0fd85 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:40 +0100
Subject: [PATCH 191/302] arm64: KVM: Refactor kern_hyp_va to deal with
 multiple offsets

As we move towards a selectable HYP VA range, it is obvious that
we don't want to test a variable to find out if we need to use
the bottom VA range, the top VA range, or use the address as is
(for VHE).

Instead, we can expand our current helper to generate the right
mask or nop with code patching. We default to using the top VA
space, with alternatives to switch to the bottom one or to nop
out the instructions.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_hyp.h | 11 ---------
 arch/arm64/include/asm/kvm_mmu.h | 42 +++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 1d81f9abd172c1..cff510574fae03 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -25,17 +25,6 @@
 
 #define __hyp_text __section(.hyp.text) notrace
 
-static inline unsigned long __kern_hyp_va(unsigned long v)
-{
-	asm volatile(ALTERNATIVE("and %0, %0, %1",
-				 "nop",
-				 ARM64_HAS_VIRT_HOST_EXTN)
-		     : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
-	return v;
-}
-
-#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
-
 #define read_sysreg_elx(r,nvh,vh)					\
 	({								\
 		u64 reg;						\
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 5e543231f61526..2970537161d227 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -90,13 +90,33 @@
 /*
  * Convert a kernel VA into a HYP VA.
  * reg: VA to be converted.
+ *
+ * This generates the following sequences:
+ * - High mask:
+ *		and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *		nop
+ * - Low mask:
+ *		and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ *		and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
+ * - VHE:
+ *		nop
+ *		nop
+ *
+ * The "low mask" version works because the mask is a strict subset of
+ * the "high mask", hence performing the first mask for nothing.
+ * Should be completely invisible on any viable CPU.
  */
 .macro kern_hyp_va	reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN	
-	and	\reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+	and     \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
 alternative_else
 	nop
 alternative_endif
+alternative_if_not ARM64_HYP_OFFSET_LOW
+	nop
+alternative_else
+	and     \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
+alternative_endif
 .endm
 
 #else
@@ -107,7 +127,23 @@ alternative_endif
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 
-#define KERN_TO_HYP(kva)	((unsigned long)kva & HYP_PAGE_OFFSET_MASK)
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+	asm volatile(ALTERNATIVE("and %0, %0, %1",
+				 "nop",
+				 ARM64_HAS_VIRT_HOST_EXTN)
+		     : "+r" (v)
+		     : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
+	asm volatile(ALTERNATIVE("nop",
+				 "and %0, %0, %1",
+				 ARM64_HYP_OFFSET_LOW)
+		     : "+r" (v)
+		     : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+	return v;
+}
+
+#define kern_hyp_va(v) 	(typeof(v))(__kern_hyp_va((unsigned long)(v)))
+#define KERN_TO_HYP(v)	kern_hyp_va(v)
 
 /*
  * We currently only support a 40bit IPA.

From 1df3e2347a432fec7ec4aea67161986e116f68eb Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:41 +0100
Subject: [PATCH 192/302] arm/arm64: KVM: Export __hyp_text_start/end symbols

Declare the __hyp_idmap_text_start/end symbols in asm/virt.h (next to
the existing __hyp_text_start/end declarations) so that they can be
reused without having to declare them locally.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/virt.h   | 4 ++++
 arch/arm/kvm/mmu.c            | 2 --
 arch/arm64/include/asm/virt.h | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h
index d4ceaf5f299b8d..a2e75b84e2ae6b 100644
--- a/arch/arm/include/asm/virt.h
+++ b/arch/arm/include/asm/virt.h
@@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void)
 	return false;
 }
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 679608fa1666a7..f004e7017201a1 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,8 +32,6 @@
 
 #include "trace.h"
 
-extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
 static pgd_t *boot_hyp_pgd;
 static pgd_t *hyp_pgd;
 static pgd_t *merged_hyp_pgd;
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index dcbcf8dcbefbf7..88aa8ec784f6a2 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -82,6 +82,10 @@ extern void verify_cpu_run_el(void);
 static inline void verify_cpu_run_el(void) {}
 #endif
 
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];

From d174591016ce9fcb61182e2a4d0aac951900fc32 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:42 +0100
Subject: [PATCH 193/302] arm64: KVM: Runtime detection of lower HYP offset

Add the code that enables the switch to the lower HYP VA range.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kernel/cpufeature.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 811773d1c1d015..ffb3e14dda60dd 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
 	return is_kernel_in_hyp_mode();
 }
 
+static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+			   int __unused)
+{
+	phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+
+	/*
+	 * Activate the lower HYP offset only if:
+	 * - the idmap doesn't clash with it,
+	 * - the kernel is not running at EL2.
+	 */
+	return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.field_pos = ID_AA64PFR0_EL0_SHIFT,
 		.min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
 	},
+	{
+		.desc = "Reduced HYP mapping offset",
+		.capability = ARM64_HYP_OFFSET_LOW,
+		.def_scope = SCOPE_SYSTEM,
+		.matches = hyp_offset_low,
+	},
 	{},
 };
 

From 0535a3e2b2d518a21d93e7cfe07821f1b24ccd0c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:43 +0100
Subject: [PATCH 194/302] arm/arm64: KVM: Always have merged page tables

We're in a position where we can now always have "merged" page
tables, where both the runtime mapping and the idmap coexist.

This results in some code being removed, but there is more to come.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/kvm/mmu.c     | 74 +++++++++++++++++++-----------------------
 arch/arm64/kvm/reset.c | 31 ++++--------------
 2 files changed, 41 insertions(+), 64 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index f004e7017201a1..80d3737d68d205 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -492,13 +492,12 @@ void free_boot_hyp_pgd(void)
 
 	if (boot_hyp_pgd) {
 		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 		boot_hyp_pgd = NULL;
 	}
 
 	if (hyp_pgd)
-		unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
@@ -1691,7 +1690,7 @@ phys_addr_t kvm_mmu_get_boot_httbr(void)
 	if (__kvm_cpu_uses_extended_idmap())
 		return virt_to_phys(merged_hyp_pgd);
 	else
-		return virt_to_phys(boot_hyp_pgd);
+		return virt_to_phys(hyp_pgd);
 }
 
 phys_addr_t kvm_get_idmap_vector(void)
@@ -1704,6 +1703,22 @@ phys_addr_t kvm_get_idmap_start(void)
 	return hyp_idmap_start;
 }
 
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+	int err;
+
+	/* Create the idmap in the boot page tables */
+	err = 	__create_hyp_mappings(pgd,
+				      hyp_idmap_start, hyp_idmap_end,
+				      __phys_to_pfn(hyp_idmap_start),
+				      PAGE_HYP_EXEC);
+	if (err)
+		kvm_err("Failed to idmap %lx-%lx\n",
+			hyp_idmap_start, hyp_idmap_end);
+
+	return err;
+}
+
 int kvm_mmu_init(void)
 {
 	int err;
@@ -1719,27 +1734,25 @@ int kvm_mmu_init(void)
 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-	boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-
-	if (!hyp_pgd || !boot_hyp_pgd) {
+	if (!hyp_pgd) {
 		kvm_err("Hyp mode PGD not allocated\n");
 		err = -ENOMEM;
 		goto out;
 	}
 
-	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(boot_hyp_pgd,
-				      hyp_idmap_start, hyp_idmap_end,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
+	if (__kvm_cpu_uses_extended_idmap()) {
+		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+							 hyp_pgd_order);
+		if (!boot_hyp_pgd) {
+			kvm_err("Hyp boot PGD not allocated\n");
+			err = -ENOMEM;
+			goto out;
+		}
 
-	if (err) {
-		kvm_err("Failed to idmap %lx-%lx\n",
-			hyp_idmap_start, hyp_idmap_end);
-		goto out;
-	}
+		err = kvm_map_idmap_text(boot_hyp_pgd);
+		if (err)
+			goto out;
 
-	if (__kvm_cpu_uses_extended_idmap()) {
 		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 		if (!merged_hyp_pgd) {
 			kvm_err("Failed to allocate extra HYP pgd\n");
@@ -1747,29 +1760,10 @@ int kvm_mmu_init(void)
 		}
 		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
 				    hyp_idmap_start);
-		return 0;
-	}
-
-	/* Map the very same page at the trampoline VA */
-	err = 	__create_hyp_mappings(boot_hyp_pgd,
-				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
-	if (err) {
-		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
-			TRAMPOLINE_VA);
-		goto out;
-	}
-
-	/* Map the same page again into the runtime page tables */
-	err = 	__create_hyp_mappings(hyp_pgd,
-				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
-	if (err) {
-		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
-			TRAMPOLINE_VA);
-		goto out;
+	} else {
+		err = kvm_map_idmap_text(hyp_pgd);
+		if (err)
+			goto out;
 	}
 
 	return 0;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 7be24f2b18dbd7..8ed7e4a92e9556 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -133,30 +133,13 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
 
-extern char __hyp_idmap_text_start[];
-
 unsigned long kvm_hyp_reset_entry(void)
 {
-	if (!__kvm_cpu_uses_extended_idmap()) {
-		unsigned long offset;
-
-		/*
-		 * Find the address of __kvm_hyp_reset() in the trampoline page.
-		 * This is present in the running page tables, and the boot page
-		 * tables, so we call the code here to start the trampoline
-		 * dance in reverse.
-		 */
-		offset = (unsigned long)__kvm_hyp_reset
-			 - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK);
-
-		return TRAMPOLINE_VA + offset;
-	} else {
-		/*
-		 * KVM is running with merged page tables, which don't have the
-		 * trampoline page mapped. We know the idmap is still mapped,
-		 * but can't be called into directly. Use
-		 * __extended_idmap_trampoline to do the call.
-		 */
-		return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
-	}
+	/*
+	 * KVM is running with merged page tables, which don't have the
+	 * trampoline page mapped. We know the idmap is still mapped,
+	 * but can't be called into directly. Use
+	 * __extended_idmap_trampoline to do the call.
+	 */
+	return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
 }

From 3421e9d88d7ae70fbc8c903e44a5acace8ae2d29 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:44 +0100
Subject: [PATCH 195/302] arm64: KVM: Simplify HYP init/teardown

Now that we only have the "merged page tables" case to deal with,
there are a number of things we can simplify in the HYP code (both
at init and teardown time).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_host.h | 12 ++----
 arch/arm64/kvm/hyp-init.S         | 61 ++++---------------------------
 arch/arm64/kvm/hyp/entry.S        | 19 ----------
 arch/arm64/kvm/hyp/hyp-entry.S    | 15 ++++++++
 arch/arm64/kvm/reset.c            | 11 ------
 5 files changed, 26 insertions(+), 92 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 49095fc4b482d3..88462c30751024 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -48,7 +48,6 @@
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_dev_ioctl_check_extension(long ext);
-unsigned long kvm_hyp_reset_entry(void);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
@@ -357,19 +356,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
 	 * Call initialization code, and switch to the full blown
 	 * HYP code.
 	 */
-	__kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
-		       hyp_stack_ptr, vector_ptr);
+	__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
 }
 
+void __kvm_hyp_teardown(void);
 static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
 					phys_addr_t phys_idmap_start)
 {
-	/*
-	 * Call reset code, and switch back to stub hyp vectors.
-	 * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
-	 */
-	__kvm_call_hyp((void *)kvm_hyp_reset_entry(),
-		       boot_pgd_ptr, phys_idmap_start);
+	kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
 }
 
 static inline void kvm_arch_hardware_unsetup(void) {}
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index a873a6d8be908d..6b29d3d9e1f285 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -53,10 +53,9 @@ __invalid:
 	b	.
 
 	/*
-	 * x0: HYP boot pgd
-	 * x1: HYP pgd
-	 * x2: HYP stack
-	 * x3: HYP vectors
+	 * x0: HYP pgd
+	 * x1: HYP stack
+	 * x2: HYP vectors
 	 */
 __do_hyp_init:
 
@@ -110,71 +109,27 @@ __do_hyp_init:
 	msr	sctlr_el2, x4
 	isb
 
-	/* Skip the trampoline dance if we merged the boot and runtime PGDs */
-	cmp	x0, x1
-	b.eq	merged
-
-	/* MMU is now enabled. Get ready for the trampoline dance */
-	ldr	x4, =TRAMPOLINE_VA
-	adr	x5, target
-	bfi	x4, x5, #0, #PAGE_SHIFT
-	br	x4
-
-target: /* We're now in the trampoline code, switch page tables */
-	msr	ttbr0_el2, x1
-	isb
-
-	/* Invalidate the old TLBs */
-	tlbi	alle2
-	dsb	sy
-
-merged:
 	/* Set the stack and new vectors */
+	kern_hyp_va	x1
+	mov	sp, x1
 	kern_hyp_va	x2
-	mov	sp, x2
-	kern_hyp_va	x3
-	msr	vbar_el2, x3
+	msr	vbar_el2, x2
 
 	/* Hello, World! */
 	eret
 ENDPROC(__kvm_hyp_init)
 
 	/*
-	 * Reset kvm back to the hyp stub. This is the trampoline dance in
-	 * reverse. If kvm used an extended idmap, __extended_idmap_trampoline
-	 * calls this code directly in the idmap. In this case switching to the
-	 * boot tables is a no-op.
-	 *
-	 * x0: HYP boot pgd
-	 * x1: HYP phys_idmap_start
+	 * Reset kvm back to the hyp stub.
 	 */
 ENTRY(__kvm_hyp_reset)
-	/* We're in trampoline code in VA, switch back to boot page tables */
-	msr	ttbr0_el2, x0
-	isb
-
-	/* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
-	ic	iallu
-	tlbi	alle2
-	dsb	sy
-	isb
-
-	/* Branch into PA space */
-	adr	x0, 1f
-	bfi	x1, x0, #0, #PAGE_SHIFT
-	br	x1
-
 	/* We're now in idmap, disable MMU */
-1:	mrs	x0, sctlr_el2
+	mrs	x0, sctlr_el2
 	ldr	x1, =SCTLR_ELx_FLAGS
 	bic	x0, x0, x1		// Clear SCTL_M and etc
 	msr	sctlr_el2, x0
 	isb
 
-	/* Invalidate the old TLBs */
-	tlbi	alle2
-	dsb	sy
-
 	/* Install stub vectors */
 	adr_l	x0, __hyp_stub_vectors
 	msr	vbar_el2, x0
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 70254a65bd5b92..ce9e5e5f28cfb7 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -164,22 +164,3 @@ alternative_endif
 
 	eret
 ENDPROC(__fpsimd_guest_restore)
-
-/*
- * When using the extended idmap, we don't have a trampoline page we can use
- * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap
- * directly would be ideal, but if we're using the extended idmap then the
- * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by
- * kvm_call_hyp using kern_hyp_va.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
- */
-ENTRY(__extended_idmap_trampoline)
-	mov	x4, x1
-	adr_l	x3, __kvm_hyp_reset
-
-	/* insert __kvm_hyp_reset()s offset into phys_idmap_start */
-	bfi	x4, x3, #0, #PAGE_SHIFT
-	br	x4
-ENDPROC(__extended_idmap_trampoline)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 2d87f36d5cb494..f6d9694ae3b13f 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call)
 	isb
 	ret
 ENDPROC(__vhe_hyp_call)
+
+/*
+ * Compute the idmap address of __kvm_hyp_reset based on the idmap
+ * start passed as a parameter, and jump there.
+ *
+ * x0: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_teardown)
+	mov	x4, x0
+	adr_l	x3, __kvm_hyp_reset
+
+	/* insert __kvm_hyp_reset()s offset into phys_idmap_start */
+	bfi	x4, x3, #0, #PAGE_SHIFT
+	br	x4
+ENDPROC(__kvm_hyp_teardown)
 	
 el1_sync:				// Guest trapped into EL2
 	save_x0_to_x3
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 8ed7e4a92e9556..79f324823340de 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -132,14 +132,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset timer */
 	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
-
-unsigned long kvm_hyp_reset_entry(void)
-{
-	/*
-	 * KVM is running with merged page tables, which don't have the
-	 * trampoline page mapped. We know the idmap is still mapped,
-	 * but can't be called into directly. Use
-	 * __extended_idmap_trampoline to do the call.
-	 */
-	return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
-}

From 12fda8123d74903d1f65fb006fe4964e23ede0d1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:45 +0100
Subject: [PATCH 196/302] arm/arm64: KVM: Drop boot_pgd

Since we now only have one set of page tables, the concept of a
boot_pgd is useless and can be removed from the init/reset interfaces.
The boot_hyp_pgd page table itself is only kept as part of the
"extended idmap" handling.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_host.h   |  8 +++-----
 arch/arm/include/asm/kvm_mmu.h    |  1 -
 arch/arm/kvm/arm.c                | 15 +++------------
 arch/arm/kvm/mmu.c                |  8 --------
 arch/arm64/include/asm/kvm_host.h |  6 ++----
 arch/arm64/include/asm/kvm_mmu.h  |  1 -
 6 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 96387d477e91c8..020f4eb14f0ba0 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-				       phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
 {
@@ -272,12 +271,11 @@ static inline void __cpu_init_stage2(void)
 	kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
-					phys_addr_t phys_idmap_start)
+static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start)
 {
 	/*
 	 * TODO
-	 * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
+	 * kvm_call_reset(phys_idmap_start);
 	 */
 }
 
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 6cb4d4d5c48c41..5d161d13b4712c 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -65,7 +65,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index c74483fc39f2c3..0887cc12b4019b 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1038,7 +1038,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-	phys_addr_t boot_pgd_ptr;
 	phys_addr_t pgd_ptr;
 	unsigned long hyp_stack_ptr;
 	unsigned long stack_page;
@@ -1047,13 +1046,12 @@ static void cpu_init_hyp_mode(void *dummy)
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
 
-	boot_pgd_ptr = kvm_mmu_get_boot_httbr();
 	pgd_ptr = kvm_mmu_get_httbr();
 	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
 
-	__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
 	__cpu_init_stage2();
 
 	kvm_arm_init_debug();
@@ -1075,15 +1073,8 @@ static void cpu_hyp_reinit(void)
 
 static void cpu_hyp_reset(void)
 {
-	phys_addr_t boot_pgd_ptr;
-	phys_addr_t phys_idmap_start;
-
-	if (!is_kernel_in_hyp_mode()) {
-		boot_pgd_ptr = kvm_mmu_get_boot_httbr();
-		phys_idmap_start = kvm_get_idmap_start();
-
-		__cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
-	}
+	if (!is_kernel_in_hyp_mode())
+		__cpu_reset_hyp_mode(kvm_get_idmap_start());
 }
 
 static void _kvm_arch_hardware_enable(void *discard)
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 80d3737d68d205..dd4ccc7f7baf7e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1685,14 +1685,6 @@ phys_addr_t kvm_mmu_get_httbr(void)
 		return virt_to_phys(hyp_pgd);
 }
 
-phys_addr_t kvm_mmu_get_boot_httbr(void)
-{
-	if (__kvm_cpu_uses_extended_idmap())
-		return virt_to_phys(merged_hyp_pgd);
-	else
-		return virt_to_phys(hyp_pgd);
-}
-
 phys_addr_t kvm_get_idmap_vector(void)
 {
 	return hyp_idmap_vector;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 88462c30751024..6731d4e1c7464e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -347,8 +347,7 @@ int kvm_perf_teardown(void);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
-				       phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
 {
@@ -360,8 +359,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
 }
 
 void __kvm_hyp_teardown(void);
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
-					phys_addr_t phys_idmap_start)
+static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start)
 {
 	kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
 }
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 2970537161d227..390acabdb1b2bb 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -170,7 +170,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 phys_addr_t kvm_get_idmap_start(void);
 int kvm_mmu_init(void);

From 26781f9ce16801a9c680dae1a7c1ca2fd3d112bd Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:46 +0100
Subject: [PATCH 197/302] arm/arm64: KVM: Kill free_boot_hyp_pgd

There is no way to free the boot PGD, because it doesn't exist
anymore as a standalone entity.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   |  1 -
 arch/arm/kvm/arm.c               |  4 ----
 arch/arm/kvm/mmu.c               | 30 +++++++-----------------------
 arch/arm64/include/asm/kvm_mmu.h |  1 -
 4 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 5d161d13b4712c..d5fd9fb8550b3d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -51,7 +51,6 @@
 
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 0887cc12b4019b..9b8c53798f5078 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1323,10 +1323,6 @@ static int init_hyp_mode(void)
 		}
 	}
 
-#ifndef CONFIG_HOTPLUG_CPU
-	free_boot_hyp_pgd();
-#endif
-
 	/* set size of VMID supported by CPU */
 	kvm_vmid_bits = kvm_get_vmid_bits();
 	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index dd4ccc7f7baf7e..0b36dd52af62b1 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -481,27 +481,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
 	} while (pgd++, addr = next, addr != end);
 }
 
-/**
- * free_boot_hyp_pgd - free HYP boot page tables
- *
- * Free the HYP boot page tables. The bounce page is also freed.
- */
-void free_boot_hyp_pgd(void)
-{
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	if (boot_hyp_pgd) {
-		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-		boot_hyp_pgd = NULL;
-	}
-
-	if (hyp_pgd)
-		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
  *
@@ -516,11 +495,16 @@ void free_hyp_pgds(void)
 {
 	unsigned long addr;
 
-	free_boot_hyp_pgd();
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
 
+	if (boot_hyp_pgd) {
+		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+		boot_hyp_pgd = NULL;
+	}
+
 	if (hyp_pgd) {
+		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
 			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 390acabdb1b2bb..b89122ed827d40 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -156,7 +156,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
 
 int create_hyp_mappings(void *from, void *to, pgprot_t prot);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
 void stage2_unmap_vm(struct kvm *kvm);

From cd602a37e80c791adf2a256d2aedec60b898cd51 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:47 +0100
Subject: [PATCH 198/302] arm: KVM: Simplify HYP init

Just like for arm64, we can now make the HYP setup a lot simpler,
and we can now initialise it in one go (instead of the two
phases we currently have).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_host.h | 15 ++++------
 arch/arm/kvm/init.S             | 49 ++++++---------------------------
 2 files changed, 14 insertions(+), 50 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 020f4eb14f0ba0..eafbfd5ad34aa6 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -250,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 	 * code. The init code doesn't need to preserve these
 	 * registers as r0-r3 are already callee saved according to
 	 * the AAPCS.
-	 * Note that we slightly misuse the prototype by casing the
+	 * Note that we slightly misuse the prototype by casting the
 	 * stack pointer to a void *.
-	 *
-	 * We don't have enough registers to perform the full init in
-	 * one go.  Install the boot PGD first, and then install the
-	 * runtime PGD, stack pointer and vectors. The PGDs are always
-	 * passed as the third argument, in order to be passed into
-	 * r2-r3 to the init code (yes, this is compliant with the
-	 * PCS!).
-	 */
 
-	kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+	 * The PGDs are always passed as the third argument, in order
+	 * to be passed into r2-r3 to the init code (yes, this is
+	 * compliant with the PCS!).
+	 */
 
 	kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 1f9ae17476f908..b82a99dcfb61c3 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -32,23 +32,13 @@
  *       r2,r3 = Hypervisor pgd pointer
  *
  * The init scenario is:
- * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
- *   runtime stack, runtime vectors
- * - Enable the MMU with the boot pgd
- * - Jump to a target into the trampoline page (remember, this is the same
- *   physical page!)
- * - Now switch to the runtime pgd (same VA, and still the same physical
- *   page!)
+ * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack,
+ *   runtime vectors
  * - Invalidate TLBs
  * - Set stack and vectors
+ * - Setup the page tables
+ * - Enable the MMU
  * - Profit! (or eret, if you only care about the code).
- *
- * As we only have four registers available to pass parameters (and we
- * need six), we split the init in two phases:
- * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
- *   Provides the basic HYP init, and enable the MMU.
- * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
- *   Switches to the runtime PGD, set stack and vectors.
  */
 
 	.text
@@ -68,8 +58,11 @@ __kvm_hyp_init:
 	W(b)	.
 
 __do_hyp_init:
-	cmp	r0, #0			@ We have a SP?
-	bne	phase2			@ Yes, second stage init
+	@ Set stack pointer
+	mov	sp, r0
+
+	@ Set HVBAR to point to the HYP vectors
+	mcr	p15, 4, r1, c12, c0, 0	@ HVBAR
 
 	@ Set the HTTBR to point to the hypervisor PGD pointer passed
 	mcrr	p15, 4, rr_lo_hi(r2, r3), c2
@@ -114,33 +107,9 @@ __do_hyp_init:
  THUMB(	ldr	r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)		)
 	orr	r1, r1, r2
 	orr	r0, r0, r1
-	isb
 	mcr	p15, 4, r0, c1, c0, 0	@ HSCR
-
-	@ End of init phase-1
-	eret
-
-phase2:
-	@ Set stack pointer
-	mov	sp, r0
-
-	@ Set HVBAR to point to the HYP vectors
-	mcr	p15, 4, r1, c12, c0, 0	@ HVBAR
-
-	@ Jump to the trampoline page
-	ldr	r0, =TRAMPOLINE_VA
-	adr	r1, target
-	bfi	r0, r1, #0, #PAGE_SHIFT
-	ret	r0
-
-target:	@ We're now in the trampoline code, switch page tables
-	mcrr	p15, 4, rr_lo_hi(r2, r3), c2
 	isb
 
-	@ Invalidate the old TLBs
-	mcr	p15, 4, r0, c8, c7, 0	@ TLBIALLH
-	dsb	ish
-
 	eret
 
 	.ltorg

From e537ecd7efacaa7512e87ecb07c0c0335a902558 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:48 +0100
Subject: [PATCH 199/302] arm: KVM: Allow hyp teardown

So far, KVM was getting in the way of kexec on 32bit (and the arm64
kexec hackers couldn't be bothered to fix it on 32bit...).

With simpler page tables, tearing KVM down becomes very easy, so
let's just do it.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_asm.h    |  2 ++
 arch/arm/include/asm/kvm_host.h   |  8 +++-----
 arch/arm/kvm/arm.c                |  3 ++-
 arch/arm/kvm/init.S               | 15 +++++++++++++++
 arch/arm64/include/asm/kvm_host.h |  3 ++-
 5 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3d5a5cd071bd15..58faff5f1eb2f3 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern void __init_stage2_translation(void);
+
+extern void __kvm_hyp_reset(unsigned long);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index eafbfd5ad34aa6..58d0b69e7428ce 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -266,12 +266,10 @@ static inline void __cpu_init_stage2(void)
 	kvm_call_hyp(__init_stage2_translation);
 }
 
-static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start)
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
+					phys_addr_t phys_idmap_start)
 {
-	/*
-	 * TODO
-	 * kvm_call_reset(phys_idmap_start);
-	 */
+	kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
 }
 
 static inline int kvm_arch_dev_ioctl_check_extension(long ext)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9b8c53798f5078..7cf266c502d6de 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1074,7 +1074,8 @@ static void cpu_hyp_reinit(void)
 static void cpu_hyp_reset(void)
 {
 	if (!is_kernel_in_hyp_mode())
-		__cpu_reset_hyp_mode(kvm_get_idmap_start());
+		__cpu_reset_hyp_mode(hyp_default_vectors,
+				     kvm_get_idmap_start());
 }
 
 static void _kvm_arch_hardware_enable(void *discard)
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index b82a99dcfb61c3..bf89c919efc1a2 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -112,6 +112,21 @@ __do_hyp_init:
 
 	eret
 
+	@ r0 : stub vectors address
+ENTRY(__kvm_hyp_reset)
+	/* We're now in idmap, disable MMU */
+	mrc	p15, 4, r1, c1, c0, 0	@ HSCTLR
+	ldr	r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I)
+	bic	r1, r1, r2
+	mcr	p15, 4, r1, c1, c0, 0	@ HSCTLR
+
+	/* Install stub vectors */
+	mcr	p15, 4, r0, c12, c0, 0	@ HVBAR
+	isb
+
+	eret
+ENDPROC(__kvm_hyp_reset)
+
 	.ltorg
 
 	.globl __kvm_hyp_init_end
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 6731d4e1c7464e..69d5cc2d2e17f9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -359,7 +359,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
 }
 
 void __kvm_hyp_teardown(void);
-static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start)
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
+					phys_addr_t phys_idmap_start)
 {
 	kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
 }

From f7bec68d2faed8180d7172cdbd69d99e3cad1387 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:49 +0100
Subject: [PATCH 200/302] arm/arm64: KVM: Prune unused #defines

We can now remove a number of dead #defines, thanks to the trampoline
code being gone.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   |  9 ---------
 arch/arm64/include/asm/kvm_mmu.h | 10 ----------
 2 files changed, 19 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index d5fd9fb8550b3d..73c28180bb6353 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -26,17 +26,8 @@
  * We directly use the kernel VA for the HYP, as we can directly share
  * the mapping (HTTBR "covers" TTBR1).
  */
-#define HYP_PAGE_OFFSET_MASK	UL(~0)
-#define HYP_PAGE_OFFSET		PAGE_OFFSET
 #define KERN_TO_HYP(kva)	(kva)
 
-/*
- * Our virtual mapping for the boot-time MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the vectors
- * page, where no kernel data will ever be shared with HYP.
- */
-#define TRAMPOLINE_VA		UL(CONFIG_VECTORS_BASE)
-
 /*
  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
  */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b89122ed827d40..9226f8be634153 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -72,16 +72,6 @@
 #define HYP_PAGE_OFFSET_HIGH_MASK	((UL(1) << VA_BITS) - 1)
 #define HYP_PAGE_OFFSET_LOW_MASK	((UL(1) << (VA_BITS - 1)) - 1)
 
-/* Temporary compat define */
-#define HYP_PAGE_OFFSET_MASK		HYP_PAGE_OFFSET_HIGH_MASK
-
-/*
- * Our virtual mapping for the idmap-ed MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the last
- * possible page, where no kernel mapping will ever exist.
- */
-#define TRAMPOLINE_VA		(HYP_PAGE_OFFSET_MASK & PAGE_MASK)
-
 #ifdef __ASSEMBLY__
 
 #include <asm/alternative.h>

From eac378a9ceb7196b776a965d915e02995fb8ba55 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:50 +0100
Subject: [PATCH 201/302] arm/arm64: KVM: Check that IDMAP doesn't intersect
 with VA range

This is more of a safety measure than anything else: If we end up
with an idmap page that intersects with the range picked for the
HYP VA space, abort the KVM setup, as it is unsafe to go
further.

I cannot imagine it happening on 64bit (we have a mechanism to
work around it), but it could potentially occur on a 32bit system with
the kernel loaded high enough in memory so that it conflicts with
the kernel VA.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/kvm/mmu.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 0b36dd52af62b1..8a0aa37605c55e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1709,6 +1709,21 @@ int kvm_mmu_init(void)
 	 */
 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
+	kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
+	kvm_info("HYP VA range: %lx:%lx\n",
+		 KERN_TO_HYP(PAGE_OFFSET), KERN_TO_HYP(~0UL));
+
+	if (hyp_idmap_start >= KERN_TO_HYP(PAGE_OFFSET) &&
+	    hyp_idmap_start <  KERN_TO_HYP(~0UL)) {
+		/*
+		 * The idmap page is intersecting with the VA space,
+		 * it is not safe to continue further.
+		 */
+		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+		err = -EINVAL;
+		goto out;
+	}
+
 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
 	if (!hyp_pgd) {
 		kvm_err("Hyp mode PGD not allocated\n");

From 6c41a413fd44af8eae2949869d4d57ce681a0c30 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 30 Jun 2016 18:40:51 +0100
Subject: [PATCH 202/302] arm/arm64: Get rid of KERN_TO_HYP

We have both KERN_TO_HYP and kern_hyp_va, which do the exact same
thing. Let's standardize on the latter.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_hyp.h   |  2 --
 arch/arm/include/asm/kvm_mmu.h   |  2 +-
 arch/arm/kvm/mmu.c               | 18 +++++++++---------
 arch/arm64/include/asm/kvm_mmu.h |  1 -
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index e38fce2270ac9c..6eaff28f2ff3a9 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -25,8 +25,6 @@
 
 #define __hyp_text __section(.hyp.text) notrace
 
-#define kern_hyp_va(v) (v)
-
 #define __ACCESS_CP15(CRn, Op1, CRm, Op2)	\
 	"mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
 #define __ACCESS_CP15_64(Op1, CRm)		\
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 73c28180bb6353..3bb803d6814b2a 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -26,7 +26,7 @@
  * We directly use the kernel VA for the HYP, as we can directly share
  * the mapping (HTTBR "covers" TTBR1).
  */
-#define KERN_TO_HYP(kva)	(kva)
+#define kern_hyp_va(kva)	(kva)
 
 /*
  * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 8a0aa37605c55e..bda27b6b1aa2b5 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -506,9 +506,9 @@ void free_hyp_pgds(void)
 	if (hyp_pgd) {
 		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
 		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
 		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
 
 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 		hyp_pgd = NULL;
@@ -670,8 +670,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 {
 	phys_addr_t phys_addr;
 	unsigned long virt_addr;
-	unsigned long start = KERN_TO_HYP((unsigned long)from);
-	unsigned long end = KERN_TO_HYP((unsigned long)to);
+	unsigned long start = kern_hyp_va((unsigned long)from);
+	unsigned long end = kern_hyp_va((unsigned long)to);
 
 	if (is_kernel_in_hyp_mode())
 		return 0;
@@ -705,8 +705,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
  */
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-	unsigned long start = KERN_TO_HYP((unsigned long)from);
-	unsigned long end = KERN_TO_HYP((unsigned long)to);
+	unsigned long start = kern_hyp_va((unsigned long)from);
+	unsigned long end = kern_hyp_va((unsigned long)to);
 
 	if (is_kernel_in_hyp_mode())
 		return 0;
@@ -1711,10 +1711,10 @@ int kvm_mmu_init(void)
 
 	kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
 	kvm_info("HYP VA range: %lx:%lx\n",
-		 KERN_TO_HYP(PAGE_OFFSET), KERN_TO_HYP(~0UL));
+		 kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
 
-	if (hyp_idmap_start >= KERN_TO_HYP(PAGE_OFFSET) &&
-	    hyp_idmap_start <  KERN_TO_HYP(~0UL)) {
+	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+	    hyp_idmap_start <  kern_hyp_va(~0UL)) {
 		/*
 		 * The idmap page is intersecting with the VA space,
 		 * it is not safe to continue further.
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 9226f8be634153..b6bb83400cd8e2 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -133,7 +133,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
 }
 
 #define kern_hyp_va(v) 	(typeof(v))(__kern_hyp_va((unsigned long)(v)))
-#define KERN_TO_HYP(v)	kern_hyp_va(v)
 
 /*
  * We currently only support a 40bit IPA.

From 5ffe466cd3a33543306c37a0789e2116286367f1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 24 May 2016 12:10:27 +0200
Subject: [PATCH 203/302] KVM: s390: inject PER i-fetch events on applicable
 icpts

In case we have to emulate an instruction or part of it (instruction,
partial instruction, operation exception), we have to inject a PER
instruction-fetching event for that instruction, if hardware told us to do
so.

In case we retry an instruction, we must not inject the PER event.

Please note that we don't filter the events properly yet, so guest
debugging will be visible to the guest.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/guestdbg.c  | 17 +++++++++++++++++
 arch/s390/kvm/intercept.c | 17 ++++++++++++++---
 arch/s390/kvm/kvm-s390.h  |  3 +++
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 1e0849e209650d..31a05330d11c77 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -439,6 +439,23 @@ static int debug_exit_required(struct kvm_vcpu *vcpu)
 #define guest_per_enabled(vcpu) \
 			     (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
 
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+	const u8 ilen = kvm_s390_get_ilen(vcpu);
+	struct kvm_s390_pgm_info pgm_info = {
+		.code = PGM_PER,
+		.per_code = PER_EVENT_IFETCH >> 24,
+		.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+	};
+
+	/*
+	 * The PSW points to the next instruction, therefore the intercepted
+	 * instruction generated a PER i-fetch event. PER address therefore
+	 * points at the previous PSW address (could be an EXECUTE function).
+	 */
+	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
+
 static void filter_guest_per_event(struct kvm_vcpu *vcpu)
 {
 	u32 perc = vcpu->arch.sie_block->perc << 24;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9359f65c8634ba..850be47c4cc93f 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -364,6 +364,8 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
+	int rc, per_rc = 0;
+
 	if (kvm_is_ucontrol(vcpu->kvm))
 		return -EOPNOTSUPP;
 
@@ -372,7 +374,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 	case 0x18:
 		return handle_noop(vcpu);
 	case 0x04:
-		return handle_instruction(vcpu);
+		rc = handle_instruction(vcpu);
+		break;
 	case 0x08:
 		return handle_prog(vcpu);
 	case 0x14:
@@ -384,10 +387,18 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 	case 0x28:
 		return handle_stop(vcpu);
 	case 0x2c:
-		return handle_operexc(vcpu);
+		rc = handle_operexc(vcpu);
+		break;
 	case 0x38:
-		return handle_partial_execution(vcpu);
+		rc = handle_partial_execution(vcpu);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
+
+	/* process PER, also if the instrution is processed in user space */
+	if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+	    (!rc || rc == -EOPNOTSUPP))
+		per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+	return per_rc ? per_rc : rc;
 }
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 031f451bb2cf79..b8432862a81715 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -238,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
 }
 static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 {
+	/* don't inject PER events if we re-execute the instruction */
+	vcpu->arch.sie_block->icptstatus &= ~0x02;
 	kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
 }
 
@@ -377,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
 			    struct kvm_guest_debug *dbg);
 void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
 void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
 void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
 
 /* support for Basic/Extended SCA handling */

From 92176a8ede577d0ff78ab3298e06701f67ad5f51 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 7 Jun 2016 16:22:47 +0200
Subject: [PATCH 204/302] KVM: MMU: prepare to support mapping of VM_IO and
 VM_PFNMAP frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Handle VM_IO like VM_PFNMAP, as is common in the rest of Linux; extract
the formula to convert hva->pfn into a new function, which will soon
gain more capabilities.

Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef54b4c3179262..5aae59e00bef83 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1442,6 +1442,16 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 	return true;
 }
 
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+			       unsigned long addr, bool *async,
+			       bool write_fault, kvm_pfn_t *p_pfn)
+{
+	*p_pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+		vma->vm_pgoff;
+	BUG_ON(!kvm_is_reserved_pfn(*p_pfn));
+	return 0;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
@@ -1461,7 +1471,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn = 0;
-	int npages;
+	int npages, r;
 
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
@@ -1487,10 +1497,10 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 
 	if (vma == NULL)
 		pfn = KVM_PFN_ERR_FAULT;
-	else if ((vma->vm_flags & VM_PFNMAP)) {
-		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-			vma->vm_pgoff;
-		BUG_ON(!kvm_is_reserved_pfn(pfn));
+	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+		r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+		if (r < 0)
+			pfn = KVM_PFN_ERR_FAULT;
 	} else {
 		if (async && vma_is_valid(vma, write_fault))
 			*async = true;

From add6a0cd1c5ba51b201e1361b05a5df817083618 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 7 Jun 2016 17:51:18 +0200
Subject: [PATCH 205/302] KVM: MMU: try to fix up page faults before giving up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vGPU folks would like to trap the first access to a BAR by setting
vm_ops on the VMAs produced by mmap-ing a VFIO device.  The fault handler
then can use remap_pfn_range to place some non-reserved pages in the VMA.

This kind of VM_PFNMAP mapping is not handled by KVM, but follow_pfn
and fixup_user_fault together help support it.  The patch also supports
VM_MIXEDMAP vmas where the pfns are not reserved and thus subject to
reference counting.

Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Tested-by: Neo Jia <cjia@nvidia.com>
Reported-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 mm/gup.c            |  1 +
 virt/kvm/kvm_main.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index c057784c844456..e3ac22f90fa4be 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -720,6 +720,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(fixup_user_fault);
 
 static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
 						struct mm_struct *mm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5aae59e00bef83..154b9ab459b0a3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1446,9 +1446,45 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 			       unsigned long addr, bool *async,
 			       bool write_fault, kvm_pfn_t *p_pfn)
 {
-	*p_pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-		vma->vm_pgoff;
-	BUG_ON(!kvm_is_reserved_pfn(*p_pfn));
+	unsigned long pfn;
+	int r;
+
+	r = follow_pfn(vma, addr, &pfn);
+	if (r) {
+		/*
+		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+		 * not call the fault handler, so do it here.
+		 */
+		bool unlocked = false;
+		r = fixup_user_fault(current, current->mm, addr,
+				     (write_fault ? FAULT_FLAG_WRITE : 0),
+				     &unlocked);
+		if (unlocked)
+			return -EAGAIN;
+		if (r)
+			return r;
+
+		r = follow_pfn(vma, addr, &pfn);
+		if (r)
+			return r;
+
+	}
+
+
+	/*
+	 * Get a reference here because callers of *hva_to_pfn* and
+	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+	 * simply do nothing for reserved pfns.
+	 *
+	 * Whoever called remap_pfn_range is also going to call e.g.
+	 * unmap_mapping_range before the underlying pages are freed,
+	 * causing a call to our MMU notifier.
+	 */ 
+	kvm_get_pfn(pfn);
+
+	*p_pfn = pfn;
 	return 0;
 }
 
@@ -1493,12 +1529,15 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		goto exit;
 	}
 
+retry:
 	vma = find_vma_intersection(current->mm, addr, addr + 1);
 
 	if (vma == NULL)
 		pfn = KVM_PFN_ERR_FAULT;
 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
 		r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+		if (r == -EAGAIN)
+			goto retry;
 		if (r < 0)
 			pfn = KVM_PFN_ERR_FAULT;
 	} else {

From 03f6a22a3979f20575c39ff86c79b3fa3b7f469d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 4 Jul 2016 15:13:07 +0000
Subject: [PATCH 206/302] KVM: x86: Use ARRAY_SIZE instead of dividing sizeof
 array with sizeof an element

Use ARRAY_SIZE instead of dividing sizeof array with sizeof an element

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 85e2f0a882ca99..e564fa2c7ac881 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1112,7 +1112,7 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void)
 
 	/* Clear the reserved bits */
 	eax &= ~(0x3U << 14 | 0xfU << 28);
-	for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++)
+	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
 		if (eax == vmx_preemption_cpu_tfms[i])
 			return true;
 

From c29732a179c2ed0cb9f001a8dc07dcf432389313 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:34 +0100
Subject: [PATCH 207/302] MIPS: uasm: Add CFC1/CTC1 instructions

Add CFC1/CTC1 instructions for accessing FP control registers to uasm so
that KVM can use uasm for generating its entry point code at runtime.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/uasm.h  |  2 ++
 arch/mips/mm/uasm-micromips.c |  8 ++++++--
 arch/mips/mm/uasm-mips.c      |  2 ++
 arch/mips/mm/uasm.c           | 26 ++++++++++++++------------
 4 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index b6ecfeee4dbe8a..3153ada46e9a09 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -104,6 +104,8 @@ Ip_u1s2(_bltz);
 Ip_u1s2(_bltzl);
 Ip_u1u2s3(_bne);
 Ip_u2s3u1(_cache);
+Ip_u1u2(_cfc1);
+Ip_u1u2(_ctc1);
 Ip_u2u1s3(_daddiu);
 Ip_u3u1u2(_daddu);
 Ip_u2u1msbu3(_dins);
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index d78178daea4bc2..8b1acb2f6b8b13 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -53,6 +53,8 @@ static struct insn insn_table_MM[] = {
 	{ insn_bltzl, 0, 0 },
 	{ insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
 	{ insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
+	{ insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+	{ insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
 	{ insn_daddu, 0, 0 },
 	{ insn_daddiu, 0, 0 },
 	{ insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
@@ -166,13 +168,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...)
 	op = ip->match;
 	va_start(ap, opc);
 	if (ip->fields & RS) {
-		if (opc == insn_mfc0 || opc == insn_mtc0)
+		if (opc == insn_mfc0 || opc == insn_mtc0 ||
+		    opc == insn_cfc1 || opc == insn_ctc1)
 			op |= build_rt(va_arg(ap, u32));
 		else
 			op |= build_rs(va_arg(ap, u32));
 	}
 	if (ip->fields & RT) {
-		if (opc == insn_mfc0 || opc == insn_mtc0)
+		if (opc == insn_mfc0 || opc == insn_mtc0 ||
+		    opc == insn_cfc1 || opc == insn_ctc1)
 			op |= build_rs(va_arg(ap, u32));
 		else
 			op |= build_rt(va_arg(ap, u32));
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 9c2220a45189a6..5152544962c37e 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -67,6 +67,8 @@ static struct insn insn_table[] = {
 #else
 	{ insn_cache,  M6(cache_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
 #endif
+	{ insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+	{ insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
 	{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
 	{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
 	{ insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index ad718debc35a74..4731893db3f709 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -49,18 +49,18 @@ enum opcode {
 	insn_invalid,
 	insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
 	insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
-	insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm,
-	insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
-	insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
-	insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
-	insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
-	insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
-	insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe,
-	insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt,
-	insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu,
-	insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi,
-	insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield,
-	insn_lddir, insn_ldpte,
+	insn_bne, insn_cache, insn_cfc1, insn_ctc1, insn_daddiu, insn_daddu,
+	insn_dins, insn_dinsm, insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr,
+	insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32,
+	insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr,
+	insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld,
+	insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi,
+	insn_mflo, insn_mtc0, insn_mthc0, insn_mul, insn_or, insn_ori,
+	insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+	insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+	insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+	insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+	insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -268,6 +268,8 @@ I_u1s2(_bltz)
 I_u1s2(_bltzl)
 I_u1u2s3(_bne)
 I_u2s3u1(_cache)
+I_u1u2(_cfc1)
+I_u1u2(_ctc1)
 I_u1u2u3(_dmfc0)
 I_u1u2u3(_dmtc0)
 I_u2u1s3(_daddiu)

From 59e3559f48dcad3051f60c32775e028cd999ae53 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:35 +0100
Subject: [PATCH 208/302] MIPS: uasm: Add CFCMSA/CTCMSA instructions

Add CFCMSA/CTCMSA instructions for accessing MSA control registers to
uasm so that KVM can use uasm for generating its entry point code at
runtime.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/uasm.h      |  2 ++
 arch/mips/include/uapi/asm/inst.h | 24 +++++++++++++++++++++++-
 arch/mips/mm/uasm-micromips.c     |  2 ++
 arch/mips/mm/uasm-mips.c          |  2 ++
 arch/mips/mm/uasm.c               | 26 ++++++++++++++------------
 5 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index 3153ada46e9a09..edc02687016ea6 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -105,7 +105,9 @@ Ip_u1s2(_bltzl);
 Ip_u1u2s3(_bne);
 Ip_u2s3u1(_cache);
 Ip_u1u2(_cfc1);
+Ip_u2u1(_cfcmsa);
 Ip_u1u2(_ctc1);
+Ip_u2u1(_ctcmsa);
 Ip_u2u1s3(_daddiu);
 Ip_u3u1u2(_daddu);
 Ip_u2u1msbu3(_dins);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index a1ebf973725c79..2e624dd058ef15 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -237,6 +237,21 @@ enum bshfl_func {
 	seh_op  = 0x18,
 };
 
+/*
+ * MSA minor opcodes.
+ */
+enum msa_func {
+	msa_elm_op = 0x19,
+};
+
+/*
+ * MSA ELM opcodes.
+ */
+enum msa_elm {
+	msa_ctc_op = 0x3e,
+	msa_cfc_op = 0x7e,
+};
+
 /*
  * func field for MSA MI10 format.
  */
@@ -264,7 +279,7 @@ enum mm_major_op {
 	mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op,
 	mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op,
 	mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op,
-	mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op,
+	mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op,
 	mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op,
 	mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op,
 	mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op,
@@ -478,6 +493,13 @@ enum mm_32f_73_minor_op {
 	mm_fcvts1_op = 0xed,
 };
 
+/*
+ * (microMIPS) POOL32S minor opcodes.
+ */
+enum mm_32s_minor_op {
+	mm_32s_elm_op = 0x16,
+};
+
 /*
  * (microMIPS) POOL16C minor opcodes.
  */
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index 8b1acb2f6b8b13..eba5018961eb6b 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -54,7 +54,9 @@ static struct insn insn_table_MM[] = {
 	{ insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
 	{ insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
 	{ insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+	{ insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE },
 	{ insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
+	{ insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
 	{ insn_daddu, 0, 0 },
 	{ insn_daddiu, 0, 0 },
 	{ insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 5152544962c37e..9f77783f23a696 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -68,7 +68,9 @@ static struct insn insn_table[] = {
 	{ insn_cache,  M6(cache_op, 0, 0, 0, cache6_op),  RS | RT | SIMM9 },
 #endif
 	{ insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+	{ insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE },
 	{ insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
+	{ insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE },
 	{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
 	{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
 	{ insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 4731893db3f709..3affd08a262b17 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -49,18 +49,18 @@ enum opcode {
 	insn_invalid,
 	insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
 	insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
-	insn_bne, insn_cache, insn_cfc1, insn_ctc1, insn_daddiu, insn_daddu,
-	insn_dins, insn_dinsm, insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr,
-	insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32,
-	insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr,
-	insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld,
-	insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi,
-	insn_mflo, insn_mtc0, insn_mthc0, insn_mul, insn_or, insn_ori,
-	insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
-	insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
-	insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
-	insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
-	insn_xori, insn_yield, insn_lddir, insn_ldpte,
+	insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
+	insn_daddiu, insn_daddu, insn_dins, insn_dinsm, insn_divu, insn_dmfc0,
+	insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra,
+	insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins,
+	insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx,
+	insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0,
+	insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, insn_mthc0, insn_mul,
+	insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd,
+	insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra,
+	insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall,
+	insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh,
+	insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -269,7 +269,9 @@ I_u1s2(_bltzl)
 I_u1u2s3(_bne)
 I_u2s3u1(_cache)
 I_u1u2(_cfc1)
+I_u2u1(_cfcmsa)
 I_u1u2(_ctc1)
+I_u2u1(_ctcmsa)
 I_u1u2u3(_dmfc0)
 I_u1u2u3(_dmtc0)
 I_u2u1s3(_daddiu)

From 61c64cf99ae589af3835dbc9bb57200d4a4842ae Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:36 +0100
Subject: [PATCH 209/302] MIPS: uasm: Add DI instruction

Add DI instruction for disabling interrupts to uasm so that KVM can use
uasm for generating its entry point code at runtime.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/uasm.h      |  1 +
 arch/mips/include/uapi/asm/inst.h |  1 +
 arch/mips/mm/uasm-micromips.c     |  1 +
 arch/mips/mm/uasm-mips.c          |  1 +
 arch/mips/mm/uasm.c               | 23 ++++++++++++-----------
 5 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index edc02687016ea6..4af8a5becbbbfa 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -110,6 +110,7 @@ Ip_u1u2(_ctc1);
 Ip_u2u1(_ctcmsa);
 Ip_u2u1s3(_daddiu);
 Ip_u3u1u2(_daddu);
+Ip_u1(_di);
 Ip_u2u1msbu3(_dins);
 Ip_u2u1msbu3(_dinsm);
 Ip_u1u2(_divu);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 2e624dd058ef15..7010d0b7b7520e 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -376,6 +376,7 @@ enum mm_32axf_minor_op {
 	mm_jalrhb_op = 0x07c,
 	mm_tlbwi_op = 0x08d,
 	mm_tlbwr_op = 0x0cd,
+	mm_di_op = 0x11d,
 	mm_jalrs_op = 0x13c,
 	mm_jalrshb_op = 0x17c,
 	mm_sync_op = 0x1ad,
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index eba5018961eb6b..40bef28f192c38 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -59,6 +59,7 @@ static struct insn insn_table_MM[] = {
 	{ insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
 	{ insn_daddu, 0, 0 },
 	{ insn_daddiu, 0, 0 },
+	{ insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS },
 	{ insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
 	{ insn_dmfc0, 0, 0 },
 	{ insn_dmtc0, 0, 0 },
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 9f77783f23a696..2b7d85b8241fb2 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -74,6 +74,7 @@ static struct insn insn_table[] = {
 	{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
 	{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
 	{ insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
+	{ insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT },
 	{ insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE },
 	{ insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT },
 	{ insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET},
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 3affd08a262b17..006fb05b74a7b2 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -50,17 +50,17 @@ enum opcode {
 	insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
 	insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
 	insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
-	insn_daddiu, insn_daddu, insn_dins, insn_dinsm, insn_divu, insn_dmfc0,
-	insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra,
-	insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins,
-	insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx,
-	insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0,
-	insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, insn_mthc0, insn_mul,
-	insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd,
-	insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra,
-	insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall,
-	insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh,
-	insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte,
+	insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu,
+	insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
+	insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
+	insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
+	insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
+	insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
+	insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr,
+	insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu,
+	insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync,
+	insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait,
+	insn_wsbh, insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -276,6 +276,7 @@ I_u1u2u3(_dmfc0)
 I_u1u2u3(_dmtc0)
 I_u2u1s3(_daddiu)
 I_u3u1u2(_daddu)
+I_u1(_di);
 I_u1u2(_divu)
 I_u2u1u3(_dsll)
 I_u2u1u3(_dsll32)

From 9f730a60e5a046230cff8c9f4c8eb73f6dca7d81 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:37 +0100
Subject: [PATCH 210/302] MIPS: uasm: Add MTHI/MTLO instructions

Add MTHI/MTLO instructions for writing to the hi & lo registers to uasm
so that KVM can use uasm for generating its entry point code at runtime.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/uasm.h      |  2 ++
 arch/mips/include/uapi/asm/inst.h |  2 ++
 arch/mips/mm/uasm-micromips.c     |  2 ++
 arch/mips/mm/uasm-mips.c          |  2 ++
 arch/mips/mm/uasm.c               | 13 ++++++++-----
 5 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index 4af8a5becbbbfa..f7929f65f7ca27 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -146,6 +146,8 @@ Ip_u1(_mfhi);
 Ip_u1(_mflo);
 Ip_u1u2u3(_mtc0);
 Ip_u1u2u3(_mthc0);
+Ip_u1(_mthi);
+Ip_u1(_mtlo);
 Ip_u3u1u2(_mul);
 Ip_u3u1u2(_or);
 Ip_u2u1u3(_ori);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 7010d0b7b7520e..6319c5037e669e 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -375,7 +375,9 @@ enum mm_32axf_minor_op {
 	mm_mflo32_op = 0x075,
 	mm_jalrhb_op = 0x07c,
 	mm_tlbwi_op = 0x08d,
+	mm_mthi32_op = 0x0b5,
 	mm_tlbwr_op = 0x0cd,
+	mm_mtlo32_op = 0x0f5,
 	mm_di_op = 0x11d,
 	mm_jalrs_op = 0x13c,
 	mm_jalrshb_op = 0x17c,
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index 40bef28f192c38..277cf52d80e189 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -89,6 +89,8 @@ static struct insn insn_table_MM[] = {
 	{ insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS },
 	{ insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS },
 	{ insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD },
+	{ insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS },
+	{ insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS },
 	{ insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD },
 	{ insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD },
 	{ insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM },
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 2b7d85b8241fb2..86a3c76a1ad868 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -119,6 +119,8 @@ static struct insn insn_table[] = {
 	{ insn_mflo,  M(spec_op, 0, 0, 0, 0, mflo_op), RD },
 	{ insn_mtc0,  M(cop0_op, mtc_op, 0, 0, 0, 0),  RT | RD | SET},
 	{ insn_mthc0,  M(cop0_op, mthc0_op, 0, 0, 0, 0),  RT | RD | SET},
+	{ insn_mthi,  M(spec_op, 0, 0, 0, 0, mthi_op), RS },
+	{ insn_mtlo,  M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
 	{ insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
 	{ insn_ori,  M(ori_op, 0, 0, 0, 0, 0),	RS | RT | UIMM },
 	{ insn_or,  M(spec_op, 0, 0, 0, 0, or_op),  RS | RT | RD },
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index 006fb05b74a7b2..3e0282d301d61b 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -56,11 +56,12 @@ enum opcode {
 	insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
 	insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
 	insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
-	insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr,
-	insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu,
-	insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync,
-	insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait,
-	insn_wsbh, insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte,
+	insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori,
+	insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+	insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+	insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+	insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+	insn_xori, insn_yield, insn_lddir, insn_ldpte,
 };
 
 struct insn {
@@ -306,6 +307,8 @@ I_u1(_mfhi)
 I_u1(_mflo)
 I_u1u2u3(_mtc0)
 I_u1u2u3(_mthc0)
+I_u1(_mthi)
+I_u1(_mtlo)
 I_u3u1u2(_mul)
 I_u2u1u3(_ori)
 I_u3u1u2(_or)

From 6f63405cb67bc4424cd7cada11783dcef0f8b3c2 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:38 +0100
Subject: [PATCH 211/302] MIPS: uasm: Add r6 MUL encoding

Add the R6 MUL instruction encoding for 3 operand signed multiply to
uasm so that KVM can use uasm for generating its entry point code at
runtime on R6.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/uapi/asm/inst.h | 44 +++++++++++++++++++++++++++++++
 arch/mips/mm/uasm-mips.c          |  4 +++
 2 files changed, 48 insertions(+)

diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 6319c5037e669e..fc96012c75d1fc 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -92,6 +92,50 @@ enum spec3_op {
 	rdhwr_op  = 0x3b
 };
 
+/*
+ * Bits 10-6 minor opcode for r6 spec mult/div encodings
+ */
+enum mult_op {
+	mult_mult_op = 0x0,
+	mult_mul_op = 0x2,
+	mult_muh_op = 0x3,
+};
+enum multu_op {
+	multu_multu_op = 0x0,
+	multu_mulu_op = 0x2,
+	multu_muhu_op = 0x3,
+};
+enum div_op {
+	div_div_op = 0x0,
+	div_div6_op = 0x2,
+	div_mod_op = 0x3,
+};
+enum divu_op {
+	divu_divu_op = 0x0,
+	divu_divu6_op = 0x2,
+	divu_modu_op = 0x3,
+};
+enum dmult_op {
+	dmult_dmult_op = 0x0,
+	dmult_dmul_op = 0x2,
+	dmult_dmuh_op = 0x3,
+};
+enum dmultu_op {
+	dmultu_dmultu_op = 0x0,
+	dmultu_dmulu_op = 0x2,
+	dmultu_dmuhu_op = 0x3,
+};
+enum ddiv_op {
+	ddiv_ddiv_op = 0x0,
+	ddiv_ddiv6_op = 0x2,
+	ddiv_dmod_op = 0x3,
+};
+enum ddivu_op {
+	ddivu_ddivu_op = 0x0,
+	ddivu_ddivu6_op = 0x2,
+	ddivu_dmodu_op = 0x3,
+};
+
 /*
  * rt field of bcond opcodes.
  */
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 86a3c76a1ad868..cec52416782239 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -121,7 +121,11 @@ static struct insn insn_table[] = {
 	{ insn_mthc0,  M(cop0_op, mthc0_op, 0, 0, 0, 0),  RT | RD | SET},
 	{ insn_mthi,  M(spec_op, 0, 0, 0, 0, mthi_op), RS },
 	{ insn_mtlo,  M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
+#ifndef CONFIG_CPU_MIPSR6
 	{ insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+#else
+	{ insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD},
+#endif
 	{ insn_ori,  M(ori_op, 0, 0, 0, 0, 0),	RS | RT | UIMM },
 	{ insn_or,  M(spec_op, 0, 0, 0, 0, or_op),  RS | RT | RD },
 #ifndef CONFIG_CPU_MIPSR6

From 90e9311a34e7b88f246a6d741ef70e3fdba15a34 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:39 +0100
Subject: [PATCH 212/302] MIPS: KVM: Convert exception entry to uasm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert the whole of locore.S (assembly to enter guest and handle
exception entry) to be generated dynamically with uasm. This is done
with minimal changes to the resulting code.

The main changes are:
- Some constants are generated by uasm using LUI+ADDIU instead of
  LUI+ORI.
- Loading of lo and hi are swapped around in vcpu_run but not when
  resuming the guest after an exit. Both bits of logic are now generated
  by the same code.
- Register MOVEs in uasm use different ADDU operand ordering to GNU as,
  putting zero register into rs instead of rt.
- The JALR.HB to call the C exit handler is switched to JALR, since the
  hazard barrier would appear to be unnecessary.

This will allow further optimisation in the future to dynamically handle
the capabilities of the CPU.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |   8 +-
 arch/mips/kvm/Kconfig            |   1 +
 arch/mips/kvm/Makefile           |   2 +-
 arch/mips/kvm/entry.c            | 622 +++++++++++++++++++++++++++++++
 arch/mips/kvm/interrupt.h        |   4 -
 arch/mips/kvm/locore.S           | 602 ------------------------------
 arch/mips/kvm/mips.c             |  37 +-
 7 files changed, 642 insertions(+), 634 deletions(-)
 create mode 100644 arch/mips/kvm/entry.c
 delete mode 100644 arch/mips/kvm/locore.S

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b0773c6d622fa5..2e76e899079c1d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -533,8 +533,12 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
 /* Debug: dump vcpu state */
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 
-/* Trampoline ASM routine to start running in "Guest" context */
-extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+/* Building of entry/exception code */
+void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_exception(void *addr);
+void *kvm_mips_build_exit(void *addr);
 
 /* FPU/MSA context management */
 void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 2ae12825529f8f..7c56d6b124d162 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -17,6 +17,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
+	select EXPORT_UASM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select KVM_MMIO
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 0aabe40fcac9b7..847429de780d3b 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -7,7 +7,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
 
 common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 
-kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
+kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
 	    interrupt.o stats.o commpage.o \
 	    dyntrans.o trap_emul.o fpu.o
 kvm-objs += mmu.o
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
new file mode 100644
index 00000000000000..9a18b4939b3539
--- /dev/null
+++ b/arch/mips/kvm/entry.c
@@ -0,0 +1,622 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Generation of main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ *
+ * Copyright (C) 2016 Imagination Technologies Ltd.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/msa.h>
+#include <asm/setup.h>
+#include <asm/uasm.h>
+
+/* Register names */
+#define ZERO		0
+#define AT		1
+#define V0		2
+#define V1		3
+#define A0		4
+#define A1		5
+
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#define T0		8
+#define T1		9
+#define T2		10
+#define T3		11
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+
+#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32
+#define T0		12
+#define T1		13
+#define T2		14
+#define T3		15
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */
+
+#define S0		16
+#define S1		17
+#define T9		25
+#define K0		26
+#define K1		27
+#define GP		28
+#define SP		29
+#define RA		31
+
+/* Some CP0 registers */
+#define C0_HWRENA	7, 0
+#define C0_BADVADDR	8, 0
+#define C0_ENTRYHI	10, 0
+#define C0_STATUS	12, 0
+#define C0_CAUSE	13, 0
+#define C0_EPC		14, 0
+#define C0_EBASE	15, 1
+#define C0_CONFIG3	16, 3
+#define C0_CONFIG5	16, 5
+#define C0_DDATA_LO	28, 3
+#define C0_ERROREPC	30, 0
+
+#define CALLFRAME_SIZ   32
+
+enum label_id {
+	label_fpu_1 = 1,
+	label_msa_1,
+	label_return_to_host,
+	label_kernel_asid,
+};
+
+UASM_L_LA(_fpu_1)
+UASM_L_LA(_msa_1)
+UASM_L_LA(_return_to_host)
+UASM_L_LA(_kernel_asid)
+
+static void *kvm_mips_build_enter_guest(void *addr);
+static void *kvm_mips_build_ret_from_exit(void *addr);
+static void *kvm_mips_build_ret_to_guest(void *addr);
+static void *kvm_mips_build_ret_to_host(void *addr);
+
+/**
+ * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the start of the vcpu_run function to run a guest VCPU. The function
+ * conforms to the following prototype:
+ *
+ * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ *
+ * The exit from the guest and return to the caller is handled by the code
+ * generated by kvm_mips_build_ret_to_host().
+ *
+ * Returns:	Next address after end of written function.
+ */
+void *kvm_mips_build_vcpu_run(void *addr)
+{
+	u32 *p = addr;
+	unsigned int i;
+
+	/*
+	 * A0: run
+	 * A1: vcpu
+	 */
+
+	/* k0/k1 not being used in host kernel context */
+	uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs));
+	for (i = 16; i < 32; ++i) {
+		if (i == 24)
+			i = 28;
+		UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+	}
+
+	/* Save hi/lo */
+	uasm_i_mflo(&p, V0);
+	UASM_i_SW(&p, V0, offsetof(struct pt_regs, lo), K1);
+	uasm_i_mfhi(&p, V1);
+	UASM_i_SW(&p, V1, offsetof(struct pt_regs, hi), K1);
+
+	/* Save host status */
+	uasm_i_mfc0(&p, V0, C0_STATUS);
+	UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
+
+	/* Save DDATA_LO, will be used to store pointer to vcpu */
+	uasm_i_mfc0(&p, V1, C0_DDATA_LO);
+	UASM_i_SW(&p, V1, offsetof(struct pt_regs, cp0_epc), K1);
+
+	/* DDATA_LO has pointer to vcpu */
+	uasm_i_mtc0(&p, A1, C0_DDATA_LO);
+
+	/* Offset into vcpu->arch */
+	uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+
+	/*
+	 * Save the host stack to VCPU, used for exception processing
+	 * when we exit from the Guest
+	 */
+	UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+	/* Save the kernel gp as well */
+	UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+	/*
+	 * Setup status register for running the guest in UM, interrupts
+	 * are disabled
+	 */
+	UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV);
+	uasm_i_mtc0(&p, K0, C0_STATUS);
+	uasm_i_ehb(&p);
+
+	/* load up the new EBASE */
+	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+	uasm_i_mtc0(&p, K0, C0_EBASE);
+
+	/*
+	 * Now that the new EBASE has been loaded, unset BEV, set
+	 * interrupt mask as it was but make sure that timer interrupts
+	 * are enabled
+	 */
+	uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE);
+	uasm_i_andi(&p, V0, V0, ST0_IM);
+	uasm_i_or(&p, K0, K0, V0);
+	uasm_i_mtc0(&p, K0, C0_STATUS);
+	uasm_i_ehb(&p);
+
+	p = kvm_mips_build_enter_guest(p);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_enter_guest() - Assemble code to resume guest execution.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the code to resume guest execution. This code is common between the
+ * initial entry into the guest from the host, and returning from the exit
+ * handler back to the guest.
+ *
+ * Returns:	Next address after end of written function.
+ */
+static void *kvm_mips_build_enter_guest(void *addr)
+{
+	u32 *p = addr;
+	unsigned int i;
+	struct uasm_label labels[2];
+	struct uasm_reloc relocs[2];
+	struct uasm_label *l = labels;
+	struct uasm_reloc *r = relocs;
+
+	memset(labels, 0, sizeof(labels));
+	memset(relocs, 0, sizeof(relocs));
+
+	/* Set Guest EPC */
+	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
+	uasm_i_mtc0(&p, T0, C0_EPC);
+
+	/* Set the ASID for the Guest Kernel */
+	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
+	UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
+		  T0);
+	uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
+	uasm_i_xori(&p, T0, T0, KSU_USER);
+	uasm_il_bnez(&p, &r, T0, label_kernel_asid);
+	 uasm_i_addiu(&p, T1, K1,
+		      offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+	/* else user */
+	uasm_i_addiu(&p, T1, K1,
+		     offsetof(struct kvm_vcpu_arch, guest_user_asid));
+	uasm_l_kernel_asid(&l, p);
+
+	/* t1: contains the base of the ASID array, need to get the cpu id  */
+	/* smp_processor_id */
+	UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP);
+	/* x4 */
+	uasm_i_sll(&p, T2, T2, 2);
+	UASM_i_ADDU(&p, T3, T1, T2);
+	UASM_i_LW(&p, K0, 0, T3);
+#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
+	/* x sizeof(struct cpuinfo_mips)/4 */
+	uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+	uasm_i_mul(&p, T2, T2, T3);
+
+	UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
+	UASM_i_ADDU(&p, AT, AT, T2);
+	UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT);
+	uasm_i_and(&p, K0, K0, T2);
+#else
+	uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
+#endif
+	uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+	uasm_i_ehb(&p);
+
+	/* Disable RDHWR access */
+	uasm_i_mtc0(&p, ZERO, C0_HWRENA);
+
+	/* load the guest context from VCPU and return */
+	for (i = 1; i < 32; ++i) {
+		/* Guest k0/k1 loaded later */
+		if (i == K0 || i == K1)
+			continue;
+		UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+	}
+
+	/* Restore hi/lo */
+	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
+	uasm_i_mthi(&p, K0);
+
+	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
+	uasm_i_mtlo(&p, K0);
+
+	/* Restore the guest's k0/k1 registers */
+	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+	UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+	/* Jump to guest */
+	uasm_i_eret(&p);
+
+	uasm_resolve_relocs(relocs, labels);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_exception() - Assemble first level guest exception handler.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble exception vector code for guest execution. The generated vector will
+ * jump to the common exception handler generated by kvm_mips_build_exit().
+ *
+ * Returns:	Next address after end of written function.
+ */
+void *kvm_mips_build_exception(void *addr)
+{
+	u32 *p = addr;
+
+	/* Save guest k0 */
+	uasm_i_mtc0(&p, K0, C0_ERROREPC);
+	uasm_i_ehb(&p);
+
+	/* Get EBASE */
+	uasm_i_mfc0(&p, K0, C0_EBASE);
+	/* Get rid of CPUNum */
+	uasm_i_srl(&p, K0, K0, 10);
+	uasm_i_sll(&p, K0, K0, 10);
+	/* Save k1 @ offset 0x3000 */
+	UASM_i_SW(&p, K1, 0x3000, K0);
+
+	/* Exception handler is installed @ offset 0x2000 */
+	uasm_i_addiu(&p, K0, K0, 0x2000);
+	/* Jump to the function */
+	uasm_i_jr(&p, K0);
+	 uasm_i_nop(&p);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_exit() - Assemble common guest exit handler.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the generic guest exit handling code. This is called by the
+ * exception vectors (generated by kvm_mips_build_exception()), and calls
+ * kvm_mips_handle_exit(), then either resumes the guest or returns to the host
+ * depending on the return value.
+ *
+ * Returns:	Next address after end of written function.
+ */
+void *kvm_mips_build_exit(void *addr)
+{
+	u32 *p = addr;
+	unsigned int i;
+	struct uasm_label labels[3];
+	struct uasm_reloc relocs[3];
+	struct uasm_label *l = labels;
+	struct uasm_reloc *r = relocs;
+
+	memset(labels, 0, sizeof(labels));
+	memset(relocs, 0, sizeof(relocs));
+
+	/*
+	 * Generic Guest exception handler. We end up here when the guest
+	 * does something that causes a trap to kernel mode.
+	 */
+
+	/* Get the VCPU pointer from DDATA_LO */
+	uasm_i_mfc0(&p, K1, C0_DDATA_LO);
+	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+	/* Start saving Guest context to VCPU */
+	for (i = 0; i < 32; ++i) {
+		/* Guest k0/k1 saved later */
+		if (i == K0 || i == K1)
+			continue;
+		UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+	}
+
+	/* We need to save hi/lo and restore them on the way out */
+	uasm_i_mfhi(&p, T0);
+	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
+
+	uasm_i_mflo(&p, T0);
+	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+
+	/* Finally save guest k0/k1 to VCPU */
+	uasm_i_mfc0(&p, T0, C0_ERROREPC);
+	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+
+	/* Get GUEST k1 and save it in VCPU */
+	uasm_i_addiu(&p, T1, ZERO, ~0x2ff);
+	uasm_i_mfc0(&p, T0, C0_EBASE);
+	uasm_i_and(&p, T0, T0, T1);
+	UASM_i_LW(&p, T0, 0x3000, T0);
+	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+	/* Now that context has been saved, we can use other registers */
+
+	/* Restore vcpu */
+	uasm_i_mfc0(&p, A1, C0_DDATA_LO);
+	uasm_i_move(&p, S1, A1);
+
+	/* Restore run (vcpu->run) */
+	UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
+	/* Save pointer to run in s0, will be saved by the compiler */
+	uasm_i_move(&p, S0, A0);
+
+	/*
+	 * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
+	 * the exception
+	 */
+	uasm_i_mfc0(&p, K0, C0_EPC);
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
+
+	uasm_i_mfc0(&p, K0, C0_BADVADDR);
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
+		  K1);
+
+	uasm_i_mfc0(&p, K0, C0_CAUSE);
+	uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+
+	/* Now restore the host state just enough to run the handlers */
+
+	/* Switch EBASE to the one used by Linux */
+	/* load up the host EBASE */
+	uasm_i_mfc0(&p, V0, C0_STATUS);
+
+	uasm_i_lui(&p, AT, ST0_BEV >> 16);
+	uasm_i_or(&p, K0, V0, AT);
+
+	uasm_i_mtc0(&p, K0, C0_STATUS);
+	uasm_i_ehb(&p);
+
+	UASM_i_LA_mostly(&p, K0, (long)&ebase);
+	UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
+	uasm_i_mtc0(&p, K0, C0_EBASE);
+
+	/*
+	 * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
+	 * trigger FPE for pending exceptions.
+	 */
+	uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+	uasm_i_and(&p, V1, V0, AT);
+	uasm_il_beqz(&p, &r, V1, label_fpu_1);
+	 uasm_i_nop(&p);
+	uasm_i_cfc1(&p, T0, 31);
+	uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), K1);
+	uasm_i_ctc1(&p, ZERO, 31);
+	uasm_l_fpu_1(&l, p);
+
+#ifdef CONFIG_CPU_HAS_MSA
+	/*
+	 * If MSA is enabled, save MSACSR and clear it so that later
+	 * instructions don't trigger MSAFPE for pending exceptions.
+	 */
+	uasm_i_mfc0(&p, T0, C0_CONFIG3);
+	uasm_i_ext(&p, T0, T0, 28, 1); /* MIPS_CONF3_MSAP */
+	uasm_il_beqz(&p, &r, T0, label_msa_1);
+	 uasm_i_nop(&p);
+	uasm_i_mfc0(&p, T0, C0_CONFIG5);
+	uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+	uasm_il_beqz(&p, &r, T0, label_msa_1);
+	 uasm_i_nop(&p);
+	uasm_i_cfcmsa(&p, T0, MSA_CSR);
+	uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+		  K1);
+	uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+	uasm_l_msa_1(&l, p);
+#endif
+
+	/* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+	uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
+	uasm_i_and(&p, V0, V0, AT);
+	uasm_i_lui(&p, AT, ST0_CU0 >> 16);
+	uasm_i_or(&p, V0, V0, AT);
+	uasm_i_mtc0(&p, V0, C0_STATUS);
+	uasm_i_ehb(&p);
+
+	/* Load up host GP */
+	UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+	/* Need a stack before we can jump to "C" */
+	UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+	/* Saved host state */
+	uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs));
+
+	/*
+	 * XXXKYMA do we need to load the host ASID, maybe not because the
+	 * kernel entries are marked GLOBAL, need to verify
+	 */
+
+	/* Restore host DDATA_LO */
+	UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), SP);
+	uasm_i_mtc0(&p, K0, C0_DDATA_LO);
+
+	/* Restore RDHWR access */
+	UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+	uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+	uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+	/* Jump to handler */
+	/*
+	 * XXXKYMA: not sure if this is safe, how large is the stack??
+	 * Now jump to the kvm_mips_handle_exit() to see if we can deal
+	 * with this in the kernel
+	 */
+	UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
+	uasm_i_jalr(&p, RA, T9);
+	 uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ);
+
+	uasm_resolve_relocs(relocs, labels);
+
+	p = kvm_mips_build_ret_from_exit(p);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the code to handle the return from kvm_mips_handle_exit(), either
+ * resuming the guest or returning to the host depending on the return value.
+ *
+ * Returns:	Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_from_exit(void *addr)
+{
+	u32 *p = addr;
+	struct uasm_label labels[2];
+	struct uasm_reloc relocs[2];
+	struct uasm_label *l = labels;
+	struct uasm_reloc *r = relocs;
+
+	memset(labels, 0, sizeof(labels));
+	memset(relocs, 0, sizeof(relocs));
+
+	/* Return from handler Make sure interrupts are disabled */
+	uasm_i_di(&p, ZERO);
+	uasm_i_ehb(&p);
+
+	/*
+	 * XXXKYMA: k0/k1 could have been blown away if we processed
+	 * an exception while we were handling the exception from the
+	 * guest, reload k1
+	 */
+
+	uasm_i_move(&p, K1, S1);
+	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+	/*
+	 * Check return value, should tell us if we are returning to the
+	 * host (handle I/O etc)or resuming the guest
+	 */
+	uasm_i_andi(&p, T0, V0, RESUME_HOST);
+	uasm_il_bnez(&p, &r, T0, label_return_to_host);
+	 uasm_i_nop(&p);
+
+	p = kvm_mips_build_ret_to_guest(p);
+
+	uasm_l_return_to_host(&l, p);
+	p = kvm_mips_build_ret_to_host(p);
+
+	uasm_resolve_relocs(relocs, labels);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the guest.
+ *
+ * Returns:	Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_guest(void *addr)
+{
+	u32 *p = addr;
+
+	/* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
+	uasm_i_mtc0(&p, S1, C0_DDATA_LO);
+
+	/* Load up the Guest EBASE to minimize the window where BEV is set */
+	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+
+	/* Switch EBASE back to the one used by KVM */
+	uasm_i_mfc0(&p, V1, C0_STATUS);
+	uasm_i_lui(&p, AT, ST0_BEV >> 16);
+	uasm_i_or(&p, K0, V1, AT);
+	uasm_i_mtc0(&p, K0, C0_STATUS);
+	uasm_i_ehb(&p);
+	uasm_i_mtc0(&p, T0, C0_EBASE);
+
+	/* Setup status register for running guest in UM */
+	uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);
+	UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX));
+	uasm_i_and(&p, V1, V1, AT);
+	uasm_i_mtc0(&p, V1, C0_STATUS);
+	uasm_i_ehb(&p);
+
+	p = kvm_mips_build_enter_guest(p);
+
+	return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_host() - Assemble code to return to the host.
+ * @addr:	Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run
+ * function generated by kvm_mips_build_vcpu_run().
+ *
+ * Returns:	Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_host(void *addr)
+{
+	u32 *p = addr;
+	unsigned int i;
+
+	/* EBASE is already pointing to Linux */
+	UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+	uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs));
+
+	/* Restore host DDATA_LO */
+	UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), K1);
+	uasm_i_mtc0(&p, K0, C0_DDATA_LO);
+
+	/*
+	 * r2/v0 is the return code, shift it down by 2 (arithmetic)
+	 * to recover the err code
+	 */
+	uasm_i_sra(&p, K0, V0, 2);
+	uasm_i_move(&p, V0, K0);
+
+	/* Load context saved on the host stack */
+	for (i = 16; i < 31; ++i) {
+		if (i == 24)
+			i = 28;
+		UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+	}
+
+	UASM_i_LW(&p, K0, offsetof(struct pt_regs, hi), K1);
+	uasm_i_mthi(&p, K0);
+
+	UASM_i_LW(&p, K0, offsetof(struct pt_regs, lo), K1);
+	uasm_i_mtlo(&p, K0);
+
+	/* Restore RDHWR access */
+	UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+	uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+	uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+	/* Restore RA, which is the address we will return to */
+	UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1);
+	uasm_i_jr(&p, RA);
+	 uasm_i_nop(&p);
+
+	return p;
+}
+
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index d661c100b2198e..fb118a2c8379f8 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -28,10 +28,6 @@
 #define MIPS_EXC_MAX                12
 /* XXXSL More to follow */
 
-extern char __kvm_mips_vcpu_run_end[];
-extern char mips32_exception[], mips32_exceptionEnd[];
-extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
-
 #define C_TI        (_ULCAST_(1) << 30)
 
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
deleted file mode 100644
index 698286c0f7323f..00000000000000
--- a/arch/mips/kvm/locore.S
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Main entry point for the guest, exception handling.
- *
- * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
- * Authors: Sanjay Lal <sanjayl@kymasys.com>
- */
-
-#include <asm/asm.h>
-#include <asm/asmmacro.h>
-#include <asm/regdef.h>
-#include <asm/mipsregs.h>
-#include <asm/stackframe.h>
-#include <asm/asm-offsets.h>
-
-#define _C_LABEL(x)     x
-#define MIPSX(name)     mips32_ ## name
-#define CALLFRAME_SIZ   32
-
-/*
- * VECTOR
- *  exception vector entrypoint
- */
-#define VECTOR(x, regmask)      \
-    .ent    _C_LABEL(x),0;      \
-    EXPORT(x);
-
-#define VECTOR_END(x)      \
-    EXPORT(x);
-
-/* Overload, Danger Will Robinson!! */
-#define PT_HOST_USERLOCAL   PT_EPC
-
-#define CP0_DDATA_LO        $28,3
-
-/* Resume Flags */
-#define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
-
-#define RESUME_GUEST            0
-#define RESUME_HOST             RESUME_FLAG_HOST
-
-/*
- * __kvm_mips_vcpu_run: entry point to the guest
- * a0: run
- * a1: vcpu
- */
-	.set	noreorder
-
-FEXPORT(__kvm_mips_vcpu_run)
-	/* k0/k1 not being used in host kernel context */
-	INT_ADDIU k1, sp, -PT_SIZE
-	LONG_S	$16, PT_R16(k1)
-	LONG_S	$17, PT_R17(k1)
-	LONG_S	$18, PT_R18(k1)
-	LONG_S	$19, PT_R19(k1)
-	LONG_S	$20, PT_R20(k1)
-	LONG_S	$21, PT_R21(k1)
-	LONG_S	$22, PT_R22(k1)
-	LONG_S	$23, PT_R23(k1)
-
-	LONG_S	$28, PT_R28(k1)
-	LONG_S	$29, PT_R29(k1)
-	LONG_S	$30, PT_R30(k1)
-	LONG_S	$31, PT_R31(k1)
-
-	/* Save hi/lo */
-	mflo	v0
-	LONG_S	v0, PT_LO(k1)
-	mfhi	v1
-	LONG_S	v1, PT_HI(k1)
-
-	/* Save host status */
-	mfc0	v0, CP0_STATUS
-	LONG_S	v0, PT_STATUS(k1)
-
-	/* Save DDATA_LO, will be used to store pointer to vcpu */
-	mfc0	v1, CP0_DDATA_LO
-	LONG_S	v1, PT_HOST_USERLOCAL(k1)
-
-	/* DDATA_LO has pointer to vcpu */
-	mtc0	a1, CP0_DDATA_LO
-
-	/* Offset into vcpu->arch */
-	INT_ADDIU k1, a1, VCPU_HOST_ARCH
-
-	/*
-	 * Save the host stack to VCPU, used for exception processing
-	 * when we exit from the Guest
-	 */
-	LONG_S	sp, VCPU_HOST_STACK(k1)
-
-	/* Save the kernel gp as well */
-	LONG_S	gp, VCPU_HOST_GP(k1)
-
-	/*
-	 * Setup status register for running the guest in UM, interrupts
-	 * are disabled
-	 */
-	li	k0, (ST0_EXL | KSU_USER | ST0_BEV)
-	mtc0	k0, CP0_STATUS
-	ehb
-
-	/* load up the new EBASE */
-	LONG_L	k0, VCPU_GUEST_EBASE(k1)
-	mtc0	k0, CP0_EBASE
-
-	/*
-	 * Now that the new EBASE has been loaded, unset BEV, set
-	 * interrupt mask as it was but make sure that timer interrupts
-	 * are enabled
-	 */
-	li	k0, (ST0_EXL | KSU_USER | ST0_IE)
-	andi	v0, v0, ST0_IM
-	or	k0, k0, v0
-	mtc0	k0, CP0_STATUS
-	ehb
-
-	/* Set Guest EPC */
-	LONG_L	t0, VCPU_PC(k1)
-	mtc0	t0, CP0_EPC
-
-FEXPORT(__kvm_mips_load_asid)
-	/* Set the ASID for the Guest Kernel */
-	PTR_L	t0, VCPU_COP0(k1)
-	LONG_L	t0, COP0_STATUS(t0)
-	andi	t0, KSU_USER | ST0_ERL | ST0_EXL
-	xori	t0, KSU_USER
-	bnez	t0, 1f		/* If kernel */
-	 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-	INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-	/* t1: contains the base of the ASID array, need to get the cpu id */
-	LONG_L	t2, TI_CPU($28)             /* smp_processor_id */
-	INT_SLL	t2, t2, 2                   /* x4 */
-	REG_ADDU t3, t1, t2
-	LONG_L	k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-	li	t3, CPUINFO_SIZE/4
-	mul	t2, t2, t3		/* x sizeof(struct cpuinfo_mips)/4 */
-	LONG_L	t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-	and	k0, k0, t2
-#else
-	andi	k0, k0, MIPS_ENTRYHI_ASID
-#endif
-	mtc0	k0, CP0_ENTRYHI
-	ehb
-
-	/* Disable RDHWR access */
-	mtc0	zero, CP0_HWRENA
-
-	.set	noat
-	/* Now load up the Guest Context from VCPU */
-	LONG_L	$1, VCPU_R1(k1)
-	LONG_L	$2, VCPU_R2(k1)
-	LONG_L	$3, VCPU_R3(k1)
-
-	LONG_L	$4, VCPU_R4(k1)
-	LONG_L	$5, VCPU_R5(k1)
-	LONG_L	$6, VCPU_R6(k1)
-	LONG_L	$7, VCPU_R7(k1)
-
-	LONG_L	$8, VCPU_R8(k1)
-	LONG_L	$9, VCPU_R9(k1)
-	LONG_L	$10, VCPU_R10(k1)
-	LONG_L	$11, VCPU_R11(k1)
-	LONG_L	$12, VCPU_R12(k1)
-	LONG_L	$13, VCPU_R13(k1)
-	LONG_L	$14, VCPU_R14(k1)
-	LONG_L	$15, VCPU_R15(k1)
-	LONG_L	$16, VCPU_R16(k1)
-	LONG_L	$17, VCPU_R17(k1)
-	LONG_L	$18, VCPU_R18(k1)
-	LONG_L	$19, VCPU_R19(k1)
-	LONG_L	$20, VCPU_R20(k1)
-	LONG_L	$21, VCPU_R21(k1)
-	LONG_L	$22, VCPU_R22(k1)
-	LONG_L	$23, VCPU_R23(k1)
-	LONG_L	$24, VCPU_R24(k1)
-	LONG_L	$25, VCPU_R25(k1)
-
-	/* k0/k1 loaded up later */
-
-	LONG_L	$28, VCPU_R28(k1)
-	LONG_L	$29, VCPU_R29(k1)
-	LONG_L	$30, VCPU_R30(k1)
-	LONG_L	$31, VCPU_R31(k1)
-
-	/* Restore hi/lo */
-	LONG_L	k0, VCPU_LO(k1)
-	mtlo	k0
-
-	LONG_L	k0, VCPU_HI(k1)
-	mthi	k0
-
-FEXPORT(__kvm_mips_load_k0k1)
-	/* Restore the guest's k0/k1 registers */
-	LONG_L	k0, VCPU_R26(k1)
-	LONG_L	k1, VCPU_R27(k1)
-
-	/* Jump to guest */
-	eret
-EXPORT(__kvm_mips_vcpu_run_end)
-
-VECTOR(MIPSX(exception), unknown)
-/* Find out what mode we came from and jump to the proper handler. */
-	mtc0	k0, CP0_ERROREPC	#01: Save guest k0
-	ehb				#02:
-
-	mfc0	k0, CP0_EBASE		#02: Get EBASE
-	INT_SRL	k0, k0, 10		#03: Get rid of CPUNum
-	INT_SLL	k0, k0, 10		#04
-	LONG_S	k1, 0x3000(k0)		#05: Save k1 @ offset 0x3000
-	INT_ADDIU k0, k0, 0x2000	#06: Exception handler is
-					#    installed @ offset 0x2000
-	j	k0			#07: jump to the function
-	 nop				#08: branch delay slot
-VECTOR_END(MIPSX(exceptionEnd))
-.end MIPSX(exception)
-
-/*
- * Generic Guest exception handler. We end up here when the guest
- * does something that causes a trap to kernel mode.
- */
-NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
-	/* Get the VCPU pointer from DDTATA_LO */
-	mfc0	k1, CP0_DDATA_LO
-	INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-	/* Start saving Guest context to VCPU */
-	LONG_S	$0, VCPU_R0(k1)
-	LONG_S	$1, VCPU_R1(k1)
-	LONG_S	$2, VCPU_R2(k1)
-	LONG_S	$3, VCPU_R3(k1)
-	LONG_S	$4, VCPU_R4(k1)
-	LONG_S	$5, VCPU_R5(k1)
-	LONG_S	$6, VCPU_R6(k1)
-	LONG_S	$7, VCPU_R7(k1)
-	LONG_S	$8, VCPU_R8(k1)
-	LONG_S	$9, VCPU_R9(k1)
-	LONG_S	$10, VCPU_R10(k1)
-	LONG_S	$11, VCPU_R11(k1)
-	LONG_S	$12, VCPU_R12(k1)
-	LONG_S	$13, VCPU_R13(k1)
-	LONG_S	$14, VCPU_R14(k1)
-	LONG_S	$15, VCPU_R15(k1)
-	LONG_S	$16, VCPU_R16(k1)
-	LONG_S	$17, VCPU_R17(k1)
-	LONG_S	$18, VCPU_R18(k1)
-	LONG_S	$19, VCPU_R19(k1)
-	LONG_S	$20, VCPU_R20(k1)
-	LONG_S	$21, VCPU_R21(k1)
-	LONG_S	$22, VCPU_R22(k1)
-	LONG_S	$23, VCPU_R23(k1)
-	LONG_S	$24, VCPU_R24(k1)
-	LONG_S	$25, VCPU_R25(k1)
-
-	/* Guest k0/k1 saved later */
-
-	LONG_S	$28, VCPU_R28(k1)
-	LONG_S	$29, VCPU_R29(k1)
-	LONG_S	$30, VCPU_R30(k1)
-	LONG_S	$31, VCPU_R31(k1)
-
-	.set at
-
-	/* We need to save hi/lo and restore them on the way out */
-	mfhi	t0
-	LONG_S	t0, VCPU_HI(k1)
-
-	mflo	t0
-	LONG_S	t0, VCPU_LO(k1)
-
-	/* Finally save guest k0/k1 to VCPU */
-	mfc0	t0, CP0_ERROREPC
-	LONG_S	t0, VCPU_R26(k1)
-
-	/* Get GUEST k1 and save it in VCPU */
-	PTR_LI	t1, ~0x2ff
-	mfc0	t0, CP0_EBASE
-	and	t0, t0, t1
-	LONG_L	t0, 0x3000(t0)
-	LONG_S	t0, VCPU_R27(k1)
-
-	/* Now that context has been saved, we can use other registers */
-
-	/* Restore vcpu */
-	mfc0	a1, CP0_DDATA_LO
-	move	s1, a1
-
-	/* Restore run (vcpu->run) */
-	LONG_L	a0, VCPU_RUN(a1)
-	/* Save pointer to run in s0, will be saved by the compiler */
-	move	s0, a0
-
-	/*
-	 * Save Host level EPC, BadVaddr and Cause to VCPU, useful to
-	 * process the exception
-	 */
-	mfc0	k0,CP0_EPC
-	LONG_S	k0, VCPU_PC(k1)
-
-	mfc0	k0, CP0_BADVADDR
-	LONG_S	k0, VCPU_HOST_CP0_BADVADDR(k1)
-
-	mfc0	k0, CP0_CAUSE
-	sw	k0, VCPU_HOST_CP0_CAUSE(k1)
-
-	/* Now restore the host state just enough to run the handlers */
-
-	/* Switch EBASE to the one used by Linux */
-	/* load up the host EBASE */
-	mfc0	v0, CP0_STATUS
-
-	or	k0, v0, ST0_BEV
-
-	mtc0	k0, CP0_STATUS
-	ehb
-
-	LONG_L	k0, ebase
-	mtc0	k0,CP0_EBASE
-
-	/*
-	 * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
-	 * trigger FPE for pending exceptions.
-	 */
-	and	v1, v0, ST0_CU1
-	beqz	v1, 1f
-	 nop
-	.set	push
-	SET_HARDFLOAT
-	cfc1	t0, fcr31
-	sw	t0, VCPU_FCR31(k1)
-	ctc1	zero,fcr31
-	.set	pop
-1:
-
-#ifdef CONFIG_CPU_HAS_MSA
-	/*
-	 * If MSA is enabled, save MSACSR and clear it so that later
-	 * instructions don't trigger MSAFPE for pending exceptions.
-	 */
-	mfc0	t0, CP0_CONFIG3
-	ext	t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
-	beqz	t0, 1f
-	 nop
-	mfc0	t0, CP0_CONFIG5
-	ext	t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
-	beqz	t0, 1f
-	 nop
-	_cfcmsa	t0, MSA_CSR
-	sw	t0, VCPU_MSA_CSR(k1)
-	_ctcmsa	MSA_CSR, zero
-1:
-#endif
-
-	/* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
-	and	v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
-	or	v0, v0, ST0_CU0
-	mtc0	v0, CP0_STATUS
-	ehb
-
-	/* Load up host GP */
-	LONG_L	gp, VCPU_HOST_GP(k1)
-
-	/* Need a stack before we can jump to "C" */
-	LONG_L	sp, VCPU_HOST_STACK(k1)
-
-	/* Saved host state */
-	INT_ADDIU sp, sp, -PT_SIZE
-
-	/*
-	 * XXXKYMA do we need to load the host ASID, maybe not because the
-	 * kernel entries are marked GLOBAL, need to verify
-	 */
-
-	/* Restore host DDATA_LO */
-	LONG_L	k0, PT_HOST_USERLOCAL(sp)
-	mtc0	k0, CP0_DDATA_LO
-
-	/* Restore RDHWR access */
-	INT_L	k0, hwrena
-	mtc0	k0, CP0_HWRENA
-
-	/* Jump to handler */
-FEXPORT(__kvm_mips_jump_to_handler)
-	/*
-	 * XXXKYMA: not sure if this is safe, how large is the stack??
-	 * Now jump to the kvm_mips_handle_exit() to see if we can deal
-	 * with this in the kernel
-	 */
-	PTR_LA	t9, kvm_mips_handle_exit
-	jalr.hb	t9
-	 INT_ADDIU sp, sp, -CALLFRAME_SIZ           /* BD Slot */
-
-	/* Return from handler Make sure interrupts are disabled */
-	di
-	ehb
-
-	/*
-	 * XXXKYMA: k0/k1 could have been blown away if we processed
-	 * an exception while we were handling the exception from the
-	 * guest, reload k1
-	 */
-
-	move	k1, s1
-	INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
-	/*
-	 * Check return value, should tell us if we are returning to the
-	 * host (handle I/O etc)or resuming the guest
-	 */
-	andi	t0, v0, RESUME_HOST
-	bnez	t0, __kvm_mips_return_to_host
-	 nop
-
-__kvm_mips_return_to_guest:
-	/* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-	mtc0	s1, CP0_DDATA_LO
-
-	/* Load up the Guest EBASE to minimize the window where BEV is set */
-	LONG_L	t0, VCPU_GUEST_EBASE(k1)
-
-	/* Switch EBASE back to the one used by KVM */
-	mfc0	v1, CP0_STATUS
-	or	k0, v1, ST0_BEV
-	mtc0	k0, CP0_STATUS
-	ehb
-	mtc0	t0, CP0_EBASE
-
-	/* Setup status register for running guest in UM */
-	or	v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-	and	v1, v1, ~(ST0_CU0 | ST0_MX)
-	mtc0	v1, CP0_STATUS
-	ehb
-
-	/* Set Guest EPC */
-	LONG_L	t0, VCPU_PC(k1)
-	mtc0	t0, CP0_EPC
-
-	/* Set the ASID for the Guest Kernel */
-	PTR_L	t0, VCPU_COP0(k1)
-	LONG_L	t0, COP0_STATUS(t0)
-	andi	t0, KSU_USER | ST0_ERL | ST0_EXL
-	xori	t0, KSU_USER
-	bnez	t0, 1f		/* If kernel */
-	 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
-	INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID    /* else user */
-1:
-	/* t1: contains the base of the ASID array, need to get the cpu id  */
-	LONG_L	t2, TI_CPU($28)		/* smp_processor_id */
-	INT_SLL	t2, t2, 2		/* x4 */
-	REG_ADDU t3, t1, t2
-	LONG_L	k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-	li	t3, CPUINFO_SIZE/4
-	mul	t2, t2, t3		/* x sizeof(struct cpuinfo_mips)/4 */
-	LONG_L	t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
-	and	k0, k0, t2
-#else
-	andi	k0, k0, MIPS_ENTRYHI_ASID
-#endif
-	mtc0	k0, CP0_ENTRYHI
-	ehb
-
-	/* Disable RDHWR access */
-	mtc0	zero, CP0_HWRENA
-
-	.set	noat
-	/* load the guest context from VCPU and return */
-	LONG_L	$0, VCPU_R0(k1)
-	LONG_L	$1, VCPU_R1(k1)
-	LONG_L	$2, VCPU_R2(k1)
-	LONG_L	$3, VCPU_R3(k1)
-	LONG_L	$4, VCPU_R4(k1)
-	LONG_L	$5, VCPU_R5(k1)
-	LONG_L	$6, VCPU_R6(k1)
-	LONG_L	$7, VCPU_R7(k1)
-	LONG_L	$8, VCPU_R8(k1)
-	LONG_L	$9, VCPU_R9(k1)
-	LONG_L	$10, VCPU_R10(k1)
-	LONG_L	$11, VCPU_R11(k1)
-	LONG_L	$12, VCPU_R12(k1)
-	LONG_L	$13, VCPU_R13(k1)
-	LONG_L	$14, VCPU_R14(k1)
-	LONG_L	$15, VCPU_R15(k1)
-	LONG_L	$16, VCPU_R16(k1)
-	LONG_L	$17, VCPU_R17(k1)
-	LONG_L	$18, VCPU_R18(k1)
-	LONG_L	$19, VCPU_R19(k1)
-	LONG_L	$20, VCPU_R20(k1)
-	LONG_L	$21, VCPU_R21(k1)
-	LONG_L	$22, VCPU_R22(k1)
-	LONG_L	$23, VCPU_R23(k1)
-	LONG_L	$24, VCPU_R24(k1)
-	LONG_L	$25, VCPU_R25(k1)
-
-	/* $/k1 loaded later */
-	LONG_L	$28, VCPU_R28(k1)
-	LONG_L	$29, VCPU_R29(k1)
-	LONG_L	$30, VCPU_R30(k1)
-	LONG_L	$31, VCPU_R31(k1)
-
-FEXPORT(__kvm_mips_skip_guest_restore)
-	LONG_L	k0, VCPU_HI(k1)
-	mthi	k0
-
-	LONG_L	k0, VCPU_LO(k1)
-	mtlo	k0
-
-	LONG_L	k0, VCPU_R26(k1)
-	LONG_L	k1, VCPU_R27(k1)
-
-	eret
-	.set	at
-
-__kvm_mips_return_to_host:
-	/* EBASE is already pointing to Linux */
-	LONG_L	k1, VCPU_HOST_STACK(k1)
-	INT_ADDIU k1,k1, -PT_SIZE
-
-	/* Restore host DDATA_LO */
-	LONG_L	k0, PT_HOST_USERLOCAL(k1)
-	mtc0	k0, CP0_DDATA_LO
-
-	/*
-	 * r2/v0 is the return code, shift it down by 2 (arithmetic)
-	 * to recover the err code
-	 */
-	INT_SRA	k0, v0, 2
-	move	$2, k0
-
-	/* Load context saved on the host stack */
-	LONG_L	$16, PT_R16(k1)
-	LONG_L	$17, PT_R17(k1)
-	LONG_L	$18, PT_R18(k1)
-	LONG_L	$19, PT_R19(k1)
-	LONG_L	$20, PT_R20(k1)
-	LONG_L	$21, PT_R21(k1)
-	LONG_L	$22, PT_R22(k1)
-	LONG_L	$23, PT_R23(k1)
-
-	LONG_L	$28, PT_R28(k1)
-	LONG_L	$29, PT_R29(k1)
-	LONG_L	$30, PT_R30(k1)
-
-	LONG_L	k0, PT_HI(k1)
-	mthi	k0
-
-	LONG_L	k0, PT_LO(k1)
-	mtlo	k0
-
-	/* Restore RDHWR access */
-	INT_L	k0, hwrena
-	mtc0	k0, CP0_HWRENA
-
-	/* Restore RA, which is the address we will return to */
-	LONG_L	ra, PT_R31(k1)
-	j	ra
-	 nop
-
-VECTOR_END(MIPSX(GuestExceptionEnd))
-.end MIPSX(GuestException)
-
-MIPSX(exceptions):
-	####
-	##### The exception handlers.
-	#####
-	.word _C_LABEL(MIPSX(GuestException))	#  0
-	.word _C_LABEL(MIPSX(GuestException))	#  1
-	.word _C_LABEL(MIPSX(GuestException))	#  2
-	.word _C_LABEL(MIPSX(GuestException))	#  3
-	.word _C_LABEL(MIPSX(GuestException))	#  4
-	.word _C_LABEL(MIPSX(GuestException))	#  5
-	.word _C_LABEL(MIPSX(GuestException))	#  6
-	.word _C_LABEL(MIPSX(GuestException))	#  7
-	.word _C_LABEL(MIPSX(GuestException))	#  8
-	.word _C_LABEL(MIPSX(GuestException))	#  9
-	.word _C_LABEL(MIPSX(GuestException))	# 10
-	.word _C_LABEL(MIPSX(GuestException))	# 11
-	.word _C_LABEL(MIPSX(GuestException))	# 12
-	.word _C_LABEL(MIPSX(GuestException))	# 13
-	.word _C_LABEL(MIPSX(GuestException))	# 14
-	.word _C_LABEL(MIPSX(GuestException))	# 15
-	.word _C_LABEL(MIPSX(GuestException))	# 16
-	.word _C_LABEL(MIPSX(GuestException))	# 17
-	.word _C_LABEL(MIPSX(GuestException))	# 18
-	.word _C_LABEL(MIPSX(GuestException))	# 19
-	.word _C_LABEL(MIPSX(GuestException))	# 20
-	.word _C_LABEL(MIPSX(GuestException))	# 21
-	.word _C_LABEL(MIPSX(GuestException))	# 22
-	.word _C_LABEL(MIPSX(GuestException))	# 23
-	.word _C_LABEL(MIPSX(GuestException))	# 24
-	.word _C_LABEL(MIPSX(GuestException))	# 25
-	.word _C_LABEL(MIPSX(GuestException))	# 26
-	.word _C_LABEL(MIPSX(GuestException))	# 27
-	.word _C_LABEL(MIPSX(GuestException))	# 28
-	.word _C_LABEL(MIPSX(GuestException))	# 29
-	.word _C_LABEL(MIPSX(GuestException))	# 30
-	.word _C_LABEL(MIPSX(GuestException))	# 31
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 5f1163653b5062..e3ae1229f147ed 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -247,8 +247,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	int err, size, offset;
-	void *gebase;
+	int err, size;
+	void *gebase, *p;
 	int i;
 
 	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -286,41 +286,28 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	/* Save new ebase */
 	vcpu->arch.guest_ebase = gebase;
 
-	/* Copy L1 Guest Exception handler to correct offset */
+	/* Build guest exception vectors dynamically in unmapped memory */
 
 	/* TLB Refill, EXL = 0 */
-	memcpy(gebase, mips32_exception,
-	       mips32_exceptionEnd - mips32_exception);
+	kvm_mips_build_exception(gebase);
 
 	/* General Exception Entry point */
-	memcpy(gebase + 0x180, mips32_exception,
-	       mips32_exceptionEnd - mips32_exception);
+	kvm_mips_build_exception(gebase + 0x180);
 
 	/* For vectored interrupts poke the exception code @ all offsets 0-7 */
 	for (i = 0; i < 8; i++) {
 		kvm_debug("L1 Vectored handler @ %p\n",
 			  gebase + 0x200 + (i * VECTORSPACING));
-		memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception,
-		       mips32_exceptionEnd - mips32_exception);
+		kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING);
 	}
 
-	/* General handler, relocate to unmapped space for sanity's sake */
-	offset = 0x2000;
-	kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
-		  gebase + offset,
-		  mips32_GuestExceptionEnd - mips32_GuestException);
+	/* General exit handler */
+	p = gebase + 0x2000;
+	p = kvm_mips_build_exit(p);
 
-	memcpy(gebase + offset, mips32_GuestException,
-	       mips32_GuestExceptionEnd - mips32_GuestException);
-
-#ifdef MODULE
-	offset += mips32_GuestExceptionEnd - mips32_GuestException;
-	memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run,
-	       __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run);
-	vcpu->arch.vcpu_run = gebase + offset;
-#else
-	vcpu->arch.vcpu_run = __kvm_mips_vcpu_run;
-#endif
+	/* Guest entry routine */
+	vcpu->arch.vcpu_run = p;
+	p = kvm_mips_build_vcpu_run(p);
 
 	/* Invalidate the icache for these ranges */
 	local_flush_icache_range((unsigned long)gebase,

From d7b8f890b63f386ca28c820b8ddb7ff1f63cbe3c Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:40 +0100
Subject: [PATCH 213/302] MIPS: KVM: Add dumping of generated entry code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dump the generated entry code with pr_debug(), similar to how it is done
in tlbex.c, so it can be more easily debugged.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index e3ae1229f147ed..9f36dcb3c58030 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -245,6 +245,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	}
 }
 
+static inline void dump_handler(const char *symbol, void *start, void *end)
+{
+	u32 *p;
+
+	pr_debug("LEAF(%s)\n", symbol);
+
+	pr_debug("\t.set push\n");
+	pr_debug("\t.set noreorder\n");
+
+	for (p = start; p < (u32 *)end; ++p)
+		pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p);
+
+	pr_debug("\t.set\tpop\n");
+
+	pr_debug("\tEND(%s)\n", symbol);
+}
+
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	int err, size;
@@ -309,6 +326,14 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.vcpu_run = p;
 	p = kvm_mips_build_vcpu_run(p);
 
+	/* Dump the generated code */
+	pr_debug("#include <asm/asm.h>\n");
+	pr_debug("#include <asm/regdef.h>\n");
+	pr_debug("\n");
+	dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+	dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
+	dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
+
 	/* Invalidate the icache for these ranges */
 	local_flush_icache_range((unsigned long)gebase,
 				(unsigned long)gebase + ALIGN(size, PAGE_SIZE));

From 9c9886584086f33b6f709d284360c6ad6bcd01c4 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:41 +0100
Subject: [PATCH 214/302] MIPS: KVM: Drop now unused asm offsets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that locore.S is converted to uasm, remove a bunch of the assembly
offset definitions created by asm-offsets.c, including the CPUINFO_ ones
for reading the variable asid mask, and the non FPU/MSA related VCPU_
definitions. KVM's fpu.S and msa.S still use the remaining definitions.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kernel/asm-offsets.c | 66 ----------------------------------
 1 file changed, 66 deletions(-)

diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index a1263d188a5a8c..fae2f9447792a0 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -339,67 +339,9 @@ void output_pm_defines(void)
 }
 #endif
 
-void output_cpuinfo_defines(void)
-{
-	COMMENT(" MIPS cpuinfo offsets. ");
-	DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips));
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
-	OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask);
-#endif
-}
-
 void output_kvm_defines(void)
 {
 	COMMENT(" KVM/MIPS Specfic offsets. ");
-	DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
-	OFFSET(VCPU_RUN, kvm_vcpu, run);
-	OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
-
-	OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
-
-	OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
-	OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
-
-	OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
-	OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
-	OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
-
-	OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
-	OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
-	OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
-	OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
-	OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
-	OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
-	OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
-	OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
-	OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
-	OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
-	OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
-	OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
-	OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
-	OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
-	OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
-	OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
-	OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
-	OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
-	OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
-	OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
-	OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
-	OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
-	OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
-	OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
-	OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
-	OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
-	OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
-	OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
-	OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
-	OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
-	OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
-	OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
-	OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
-	OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
-	OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
-	BLANK();
 
 	OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
 	OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
@@ -437,14 +379,6 @@ void output_kvm_defines(void)
 	OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
 	OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
 	BLANK();
-
-	OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
-	OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
-	OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
-
-	OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
-	OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
-	BLANK();
 }
 
 #ifdef CONFIG_MIPS_CPS

From d37f4038d16273087bdc60387807b90a8c06da7f Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:42 +0100
Subject: [PATCH 215/302] MIPS: KVM: Omit FPU handling entry code if possible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FPU handling code on entry from guest is unnecessary if no FPU is
present, so allow it to be dropped at uasm assembly time.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index 9a18b4939b3539..c0d9f551c1c18f 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -393,18 +393,21 @@ void *kvm_mips_build_exit(void *addr)
 	UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
 	uasm_i_mtc0(&p, K0, C0_EBASE);
 
-	/*
-	 * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
-	 * trigger FPE for pending exceptions.
-	 */
-	uasm_i_lui(&p, AT, ST0_CU1 >> 16);
-	uasm_i_and(&p, V1, V0, AT);
-	uasm_il_beqz(&p, &r, V1, label_fpu_1);
-	 uasm_i_nop(&p);
-	uasm_i_cfc1(&p, T0, 31);
-	uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), K1);
-	uasm_i_ctc1(&p, ZERO, 31);
-	uasm_l_fpu_1(&l, p);
+	if (raw_cpu_has_fpu) {
+		/*
+		 * If FPU is enabled, save FCR31 and clear it so that later
+		 * ctc1's don't trigger FPE for pending exceptions.
+		 */
+		uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+		uasm_i_and(&p, V1, V0, AT);
+		uasm_il_beqz(&p, &r, V1, label_fpu_1);
+		 uasm_i_nop(&p);
+		uasm_i_cfc1(&p, T0, 31);
+		uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31),
+			  K1);
+		uasm_i_ctc1(&p, ZERO, 31);
+		uasm_l_fpu_1(&l, p);
+	}
 
 #ifdef CONFIG_CPU_HAS_MSA
 	/*

From 38ea7a715d43752e1c53d5a0c3cbab5e321f22f7 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:43 +0100
Subject: [PATCH 216/302] MIPS: KVM: Check MSA presence at uasm time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check for presence of MSA at uasm assembly time rather than at runtime
in the generated KVM host entry code. This optimises the guest exit path
by eliminating the MSA code entirely if not present, and eliminating the
read of Config3.MSAP and conditional branch if MSA is present.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index c0d9f551c1c18f..53e1e576d18a48 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -55,7 +55,6 @@
 #define C0_CAUSE	13, 0
 #define C0_EPC		14, 0
 #define C0_EBASE	15, 1
-#define C0_CONFIG3	16, 3
 #define C0_CONFIG5	16, 5
 #define C0_DDATA_LO	28, 3
 #define C0_ERROREPC	30, 0
@@ -409,25 +408,21 @@ void *kvm_mips_build_exit(void *addr)
 		uasm_l_fpu_1(&l, p);
 	}
 
-#ifdef CONFIG_CPU_HAS_MSA
-	/*
-	 * If MSA is enabled, save MSACSR and clear it so that later
-	 * instructions don't trigger MSAFPE for pending exceptions.
-	 */
-	uasm_i_mfc0(&p, T0, C0_CONFIG3);
-	uasm_i_ext(&p, T0, T0, 28, 1); /* MIPS_CONF3_MSAP */
-	uasm_il_beqz(&p, &r, T0, label_msa_1);
-	 uasm_i_nop(&p);
-	uasm_i_mfc0(&p, T0, C0_CONFIG5);
-	uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
-	uasm_il_beqz(&p, &r, T0, label_msa_1);
-	 uasm_i_nop(&p);
-	uasm_i_cfcmsa(&p, T0, MSA_CSR);
-	uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
-		  K1);
-	uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
-	uasm_l_msa_1(&l, p);
-#endif
+	if (cpu_has_msa) {
+		/*
+		 * If MSA is enabled, save MSACSR and clear it so that later
+		 * instructions don't trigger MSAFPE for pending exceptions.
+		 */
+		uasm_i_mfc0(&p, T0, C0_CONFIG5);
+		uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+		uasm_il_beqz(&p, &r, T0, label_msa_1);
+		 uasm_i_nop(&p);
+		uasm_i_cfcmsa(&p, T0, MSA_CSR);
+		uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+			  K1);
+		uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+		uasm_l_msa_1(&l, p);
+	}
 
 	/* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
 	uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));

From 025014e3fb8f6afab92d3050c3423e2b1ffcbc84 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:44 +0100
Subject: [PATCH 217/302] MIPS: KVM: Drop redundant restore of DDATA_LO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On return from the exit handler to the host (without re-entering the
guest) we restore the saved value of the DDATA_LO register which we use
as a scratch register. However we've already restored it ready for
calling the exit handler so there is no need to do it again, so drop
that code.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index 53e1e576d18a48..6395bfa7e54260 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -581,10 +581,6 @@ static void *kvm_mips_build_ret_to_host(void *addr)
 	UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
 	uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs));
 
-	/* Restore host DDATA_LO */
-	UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), K1);
-	uasm_i_mtc0(&p, K0, C0_DDATA_LO);
-
 	/*
 	 * r2/v0 is the return code, shift it down by 2 (arithmetic)
 	 * to recover the err code

From 1e5217f54251ddd339e00a0b30f126589737d467 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:45 +0100
Subject: [PATCH 218/302] MIPS: KVM: Dynamically choose scratch registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scratch cop0 registers are needed by KVM to be able to save/restore all
the GPRs, including k0/k1, and for storing the VCPU pointer. However no
registers are universally suitable for these purposes, so the decision
should be made at runtime.

Until now, we've used DDATA_LO to store the VCPU pointer, and ErrorEPC
as a temporary. It could be argued that this is abuse of those
registers, and DDATA_LO is known not to be usable on certain
implementations (Cavium Octeon). If KScratch registers are present, use
them instead.

We save & restore the temporary register in addition to the VCPU pointer
register when using a KScratch register for it, as it may be used for
normal host TLB handling too.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  1 +
 arch/mips/kvm/entry.c            | 94 +++++++++++++++++++++++++++-----
 arch/mips/kvm/mips.c             |  4 ++
 3 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2e76e899079c1d..a80c3208b23493 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -536,6 +536,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
 /* Building of entry/exception code */
+int kvm_mips_entry_setup(void);
 void *kvm_mips_build_vcpu_run(void *addr);
 void *kvm_mips_build_exception(void *addr);
 void *kvm_mips_build_exit(void *addr);
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index 6395bfa7e54260..b6e7fd9f12f0ec 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -61,6 +61,9 @@
 
 #define CALLFRAME_SIZ   32
 
+static unsigned int scratch_vcpu[2] = { C0_DDATA_LO };
+static unsigned int scratch_tmp[2] = { C0_ERROREPC };
+
 enum label_id {
 	label_fpu_1 = 1,
 	label_msa_1,
@@ -78,6 +81,69 @@ static void *kvm_mips_build_ret_from_exit(void *addr);
 static void *kvm_mips_build_ret_to_guest(void *addr);
 static void *kvm_mips_build_ret_to_host(void *addr);
 
+/**
+ * kvm_mips_entry_setup() - Perform global setup for entry code.
+ *
+ * Perform global setup for entry code, such as choosing a scratch register.
+ *
+ * Returns:	0 on success.
+ *		-errno on failure.
+ */
+int kvm_mips_entry_setup(void)
+{
+	/*
+	 * We prefer to use KScratchN registers if they are available over the
+	 * defaults above, which may not work on all cores.
+	 */
+	unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+
+	/* Pick a scratch register for storing VCPU */
+	if (kscratch_mask) {
+		scratch_vcpu[0] = 31;
+		scratch_vcpu[1] = ffs(kscratch_mask) - 1;
+		kscratch_mask &= ~BIT(scratch_vcpu[1]);
+	}
+
+	/* Pick a scratch register to use as a temp for saving state */
+	if (kscratch_mask) {
+		scratch_tmp[0] = 31;
+		scratch_tmp[1] = ffs(kscratch_mask) - 1;
+		kscratch_mask &= ~BIT(scratch_tmp[1]);
+	}
+
+	return 0;
+}
+
+static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
+					unsigned int frame)
+{
+	/* Save the VCPU scratch register value in cp0_epc of the stack frame */
+	uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+
+	/* Save the temp scratch register value in cp0_cause of stack frame */
+	if (scratch_tmp[0] == 31) {
+		uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+		UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+	}
+}
+
+static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
+					   unsigned int frame)
+{
+	/*
+	 * Restore host scratch register values saved by
+	 * kvm_mips_build_save_scratch().
+	 */
+	UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+	uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+
+	if (scratch_tmp[0] == 31) {
+		UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+		uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+	}
+}
+
 /**
  * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
  * @addr:	Address to start writing code.
@@ -120,12 +186,11 @@ void *kvm_mips_build_vcpu_run(void *addr)
 	uasm_i_mfc0(&p, V0, C0_STATUS);
 	UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
 
-	/* Save DDATA_LO, will be used to store pointer to vcpu */
-	uasm_i_mfc0(&p, V1, C0_DDATA_LO);
-	UASM_i_SW(&p, V1, offsetof(struct pt_regs, cp0_epc), K1);
+	/* Save scratch registers, will be used to store pointer to vcpu etc */
+	kvm_mips_build_save_scratch(&p, V1, K1);
 
-	/* DDATA_LO has pointer to vcpu */
-	uasm_i_mtc0(&p, A1, C0_DDATA_LO);
+	/* VCPU scratch register has pointer to vcpu */
+	uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
 
 	/* Offset into vcpu->arch */
 	uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
@@ -273,7 +338,7 @@ void *kvm_mips_build_exception(void *addr)
 	u32 *p = addr;
 
 	/* Save guest k0 */
-	uasm_i_mtc0(&p, K0, C0_ERROREPC);
+	uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]);
 	uasm_i_ehb(&p);
 
 	/* Get EBASE */
@@ -321,8 +386,8 @@ void *kvm_mips_build_exit(void *addr)
 	 * does something that causes a trap to kernel mode.
 	 */
 
-	/* Get the VCPU pointer from DDATA_LO */
-	uasm_i_mfc0(&p, K1, C0_DDATA_LO);
+	/* Get the VCPU pointer from the scratch register */
+	uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
 	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
 
 	/* Start saving Guest context to VCPU */
@@ -341,7 +406,7 @@ void *kvm_mips_build_exit(void *addr)
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
 
 	/* Finally save guest k0/k1 to VCPU */
-	uasm_i_mfc0(&p, T0, C0_ERROREPC);
+	uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
 
 	/* Get GUEST k1 and save it in VCPU */
@@ -354,7 +419,7 @@ void *kvm_mips_build_exit(void *addr)
 	/* Now that context has been saved, we can use other registers */
 
 	/* Restore vcpu */
-	uasm_i_mfc0(&p, A1, C0_DDATA_LO);
+	uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
 	uasm_i_move(&p, S1, A1);
 
 	/* Restore run (vcpu->run) */
@@ -446,9 +511,8 @@ void *kvm_mips_build_exit(void *addr)
 	 * kernel entries are marked GLOBAL, need to verify
 	 */
 
-	/* Restore host DDATA_LO */
-	UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), SP);
-	uasm_i_mtc0(&p, K0, C0_DDATA_LO);
+	/* Restore host scratch registers, as we'll have clobbered them */
+	kvm_mips_build_restore_scratch(&p, K0, SP);
 
 	/* Restore RDHWR access */
 	UASM_i_LA_mostly(&p, K0, (long)&hwrena);
@@ -536,8 +600,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
 {
 	u32 *p = addr;
 
-	/* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
-	uasm_i_mtc0(&p, S1, C0_DDATA_LO);
+	/* Put the saved pointer to vcpu (s1) back into the scratch register */
+	uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
 
 	/* Load up the Guest EBASE to minimize the window where BEV is set */
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9f36dcb3c58030..26cc0b93c56542 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1775,6 +1775,10 @@ static int __init kvm_mips_init(void)
 {
 	int ret;
 
+	ret = kvm_mips_entry_setup();
+	if (ret)
+		return ret;
+
 	ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
 
 	if (ret)

From 1f9ca62cbc5f4d1663a0f0d193156ce9dc6ed452 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:46 +0100
Subject: [PATCH 219/302] MIPS: KVM: Relative branch to common exit handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use a relative branch to get from the individual exception vectors to
the common guest exit handler, rather than loading the address of the
exit handler and jumping to it.

This is made easier due to the fact we are now generating the entry code
dynamically. This will also allow the exception code to be further
reduced in future patches.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h |  2 +-
 arch/mips/kvm/entry.c            | 23 +++++++++++++++++------
 arch/mips/kvm/mips.c             | 12 +++++++-----
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index a80c3208b23493..b32785543787bc 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -538,7 +538,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
 /* Building of entry/exception code */
 int kvm_mips_entry_setup(void);
 void *kvm_mips_build_vcpu_run(void *addr);
-void *kvm_mips_build_exception(void *addr);
+void *kvm_mips_build_exception(void *addr, void *handler);
 void *kvm_mips_build_exit(void *addr);
 
 /* FPU/MSA context management */
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index b6e7fd9f12f0ec..fb2cbf653474eb 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -69,12 +69,14 @@ enum label_id {
 	label_msa_1,
 	label_return_to_host,
 	label_kernel_asid,
+	label_exit_common,
 };
 
 UASM_L_LA(_fpu_1)
 UASM_L_LA(_msa_1)
 UASM_L_LA(_return_to_host)
 UASM_L_LA(_kernel_asid)
+UASM_L_LA(_exit_common)
 
 static void *kvm_mips_build_enter_guest(void *addr);
 static void *kvm_mips_build_ret_from_exit(void *addr);
@@ -327,15 +329,23 @@ static void *kvm_mips_build_enter_guest(void *addr)
 /**
  * kvm_mips_build_exception() - Assemble first level guest exception handler.
  * @addr:	Address to start writing code.
+ * @handler:	Address of common handler (within range of @addr).
  *
  * Assemble exception vector code for guest execution. The generated vector will
- * jump to the common exception handler generated by kvm_mips_build_exit().
+ * branch to the common exception handler generated by kvm_mips_build_exit().
  *
  * Returns:	Next address after end of written function.
  */
-void *kvm_mips_build_exception(void *addr)
+void *kvm_mips_build_exception(void *addr, void *handler)
 {
 	u32 *p = addr;
+	struct uasm_label labels[2];
+	struct uasm_reloc relocs[2];
+	struct uasm_label *l = labels;
+	struct uasm_reloc *r = relocs;
+
+	memset(labels, 0, sizeof(labels));
+	memset(relocs, 0, sizeof(relocs));
 
 	/* Save guest k0 */
 	uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]);
@@ -349,12 +359,13 @@ void *kvm_mips_build_exception(void *addr)
 	/* Save k1 @ offset 0x3000 */
 	UASM_i_SW(&p, K1, 0x3000, K0);
 
-	/* Exception handler is installed @ offset 0x2000 */
-	uasm_i_addiu(&p, K0, K0, 0x2000);
-	/* Jump to the function */
-	uasm_i_jr(&p, K0);
+	/* Branch to the common handler */
+	uasm_il_b(&p, &r, label_exit_common);
 	 uasm_i_nop(&p);
 
+	uasm_l_exit_common(&l, handler);
+	uasm_resolve_relocs(relocs, labels);
+
 	return p;
 }
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 26cc0b93c56542..7c76768ff364ed 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -265,7 +265,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	int err, size;
-	void *gebase, *p;
+	void *gebase, *p, *handler;
 	int i;
 
 	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -304,22 +304,24 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.guest_ebase = gebase;
 
 	/* Build guest exception vectors dynamically in unmapped memory */
+	handler = gebase + 0x2000;
 
 	/* TLB Refill, EXL = 0 */
-	kvm_mips_build_exception(gebase);
+	kvm_mips_build_exception(gebase, handler);
 
 	/* General Exception Entry point */
-	kvm_mips_build_exception(gebase + 0x180);
+	kvm_mips_build_exception(gebase + 0x180, handler);
 
 	/* For vectored interrupts poke the exception code @ all offsets 0-7 */
 	for (i = 0; i < 8; i++) {
 		kvm_debug("L1 Vectored handler @ %p\n",
 			  gebase + 0x200 + (i * VECTORSPACING));
-		kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING);
+		kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING,
+					 handler);
 	}
 
 	/* General exit handler */
-	p = gebase + 0x2000;
+	p = handler;
 	p = kvm_mips_build_exit(p);
 
 	/* Guest entry routine */

From eadfb501a5f5522b3df1b06ed8ffbb063d19d827 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 23 Jun 2016 17:34:47 +0100
Subject: [PATCH 220/302] MIPS: KVM: Save k0 straight into VCPU structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently on a guest exception the guest's k0 register is saved to the
scratch temp register and the guest k1 saved to the exception base
address + 0x3000 using k0 to extract the Exception Base field of the
EBase register and as the base operand to the store. Both are then
copied into the VCPU structure after the other general purpose registers
have been saved there.

This bouncing to exception base + 0x3000 is not actually necessary as
the VCPU pointer can be determined and written through just as easily
with only a single spare register. The VCPU pointer is already needed in
k1 for saving the other GP registers, so let's save the guest k0 register
straight into the VCPU structure through k1, first saving k1 into the
scratch temp register instead of k0.

This could potentially pave the way for having a single exception base
area for use by all guests.

The ehb after saving the k register to the scratch temp register is also
delayed until just before it needs to be read back.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index fb2cbf653474eb..de8b6ec5573f43 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -347,17 +347,15 @@ void *kvm_mips_build_exception(void *addr, void *handler)
 	memset(labels, 0, sizeof(labels));
 	memset(relocs, 0, sizeof(relocs));
 
-	/* Save guest k0 */
-	uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]);
-	uasm_i_ehb(&p);
+	/* Save guest k1 into scratch register */
+	uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
 
-	/* Get EBASE */
-	uasm_i_mfc0(&p, K0, C0_EBASE);
-	/* Get rid of CPUNum */
-	uasm_i_srl(&p, K0, K0, 10);
-	uasm_i_sll(&p, K0, K0, 10);
-	/* Save k1 @ offset 0x3000 */
-	UASM_i_SW(&p, K1, 0x3000, K0);
+	/* Get the VCPU pointer from the VCPU scratch register */
+	uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+	/* Save guest k0 into VCPU structure */
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
 
 	/* Branch to the common handler */
 	uasm_il_b(&p, &r, label_exit_common);
@@ -395,12 +393,13 @@ void *kvm_mips_build_exit(void *addr)
 	/*
 	 * Generic Guest exception handler. We end up here when the guest
 	 * does something that causes a trap to kernel mode.
+	 *
+	 * Both k0/k1 registers will have already been saved (k0 into the vcpu
+	 * structure, and k1 into the scratch_tmp register).
+	 *
+	 * The k1 register will already contain the kvm_vcpu_arch pointer.
 	 */
 
-	/* Get the VCPU pointer from the scratch register */
-	uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
-	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
-
 	/* Start saving Guest context to VCPU */
 	for (i = 0; i < 32; ++i) {
 		/* Guest k0/k1 saved later */
@@ -416,15 +415,9 @@ void *kvm_mips_build_exit(void *addr)
 	uasm_i_mflo(&p, T0);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
 
-	/* Finally save guest k0/k1 to VCPU */
+	/* Finally save guest k1 to VCPU */
+	uasm_i_ehb(&p);
 	uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
-	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
-
-	/* Get GUEST k1 and save it in VCPU */
-	uasm_i_addiu(&p, T1, ZERO, ~0x2ff);
-	uasm_i_mfc0(&p, T0, C0_EBASE);
-	uasm_i_and(&p, T0, T0, T1);
-	UASM_i_LW(&p, T0, 0x3000, T0);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
 
 	/* Now that context has been saved, we can use other registers */

From 1c66b79bb3b11942a98085fd89295cf6cddae41a Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:07 +0100
Subject: [PATCH 221/302] MIPS: inst.h: Rename b{eq,ne}zcji[al]c_op to
 pop{6,7}6_op

The opcodes currently defined in inst.h as beqzcjic_op & bnezcjialc_op
are actually defined in the MIPS base instruction set manuals as pop66 &
pop76 respectively. Rename them as such, for consistency with the
documentation.

Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/uapi/asm/inst.h | 4 ++--
 arch/mips/kernel/branch.c         | 4 ++--
 arch/mips/math-emu/cp1emu.c       | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index fc96012c75d1fc..3fc00e7b33c466 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -32,9 +32,9 @@ enum major_op {
 	sb_op, sh_op, swl_op, sw_op,
 	sdl_op, sdr_op, swr_op, cache_op,
 	ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op,
-	lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op,
+	lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op,
 	sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op,
-	scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op
+	scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op
 };
 
 /*
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index 6dc3f1fdaccc36..fb9ed96d785819 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
 		epc += 4 + (insn.i_format.simmediate << 2);
 		regs->cp0_epc = epc;
 		break;
-	case beqzcjic_op:
+	case pop66_op:
 		if (!cpu_has_mips_r6) {
 			ret = -SIGILL;
 			break;
@@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
 		/* Compact branch: BEQZC || JIC */
 		regs->cp0_epc += 8;
 		break;
-	case bnezcjialc_op:
+	case pop76_op:
 		if (!cpu_has_mips_r6) {
 			ret = -SIGILL;
 			break;
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index d96e912b9d44fa..1bbf16581f195e 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 			dec_insn.next_pc_inc;
 
 		return 1;
-	case beqzcjic_op:
+	case pop66_op:
 		if (!cpu_has_mips_r6)
 			break;
 		*contpc = regs->cp0_epc + dec_insn.pc_inc +
 			dec_insn.next_pc_inc;
 
 		return 1;
-	case bnezcjialc_op:
+	case pop76_op:
 		if (!cpu_has_mips_r6)
 			break;
 		if (!insn.i_format.rs)

From 1b492600068d5fbd033196ce2bdb28735a23747e Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:08 +0100
Subject: [PATCH 222/302] MIPS: inst.h: Rename cbcond{0,1}_op to pop{1,3}0_op

The opcodes currently defined in inst.h as cbcond0_op & cbcond1_op are
actually defined in the MIPS base instruction set manuals as pop10 &
pop30 respectively. Rename them as such, for consistency with the
documentation.

Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/uapi/asm/inst.h | 4 ++--
 arch/mips/kernel/branch.c         | 4 ++--
 arch/mips/math-emu/cp1emu.c       | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 3fc00e7b33c466..77429d1622b343 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -21,11 +21,11 @@
 enum major_op {
 	spec_op, bcond_op, j_op, jal_op,
 	beq_op, bne_op, blez_op, bgtz_op,
-	addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op,
+	addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op,
 	andi_op, ori_op, xori_op, lui_op,
 	cop0_op, cop1_op, cop2_op, cop1x_op,
 	beql_op, bnel_op, blezl_op, bgtzl_op,
-	daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op,
+	daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op,
 	spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op,
 	lb_op, lh_op, lwl_op, lw_op,
 	lbu_op, lhu_op, lwr_op, lwu_op,
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index fb9ed96d785819..46c227fc98f5af 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
 		regs->cp0_epc += 8;
 		break;
 #endif
-	case cbcond0_op:
-	case cbcond1_op:
+	case pop10_op:
+	case pop30_op:
 		/* Only valid for MIPS R6 */
 		if (!cpu_has_mips_r6) {
 			ret = -SIGILL;
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 1bbf16581f195e..6dc07fba187fa8 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
 				dec_insn.pc_inc +
 				dec_insn.next_pc_inc;
 		return 1;
-	case cbcond0_op:
-	case cbcond1_op:
+	case pop10_op:
+	case pop30_op:
 		if (!cpu_has_mips_r6)
 			break;
 		if (insn.i_format.rt && !insn.i_format.rs)

From d14740fed8f12beb59d3087df985097c008e868e Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:09 +0100
Subject: [PATCH 223/302] MIPS: KVM: Fix fpu.S misassembly with r6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__kvm_save_fpu and __kvm_restore_fpu use .set mips64r2 so that they can
access the odd FPU registers as well as the even, however this causes
misassembly of the return instruction on MIPSr6.

Fix by replacing .set mips64r2 with .set fp=64, which doesn't change the
architecture revision.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/fpu.S | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S
index 531fbf5131c02b..16f17c6390dd00 100644
--- a/arch/mips/kvm/fpu.S
+++ b/arch/mips/kvm/fpu.S
@@ -14,13 +14,16 @@
 #include <asm/mipsregs.h>
 #include <asm/regdef.h>
 
+/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */
+#undef fp
+
 	.set	noreorder
 	.set	noat
 
 LEAF(__kvm_save_fpu)
 	.set	push
-	.set	mips64r2
 	SET_HARDFLOAT
+	.set	fp=64
 	mfc0	t0, CP0_STATUS
 	sll     t0, t0, 5			# is Status.FR set?
 	bgez    t0, 1f				# no: skip odd doubles
@@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu)
 
 LEAF(__kvm_restore_fpu)
 	.set	push
-	.set	mips64r2
 	SET_HARDFLOAT
+	.set	fp=64
 	mfc0	t0, CP0_STATUS
 	sll     t0, t0, 5			# is Status.FR set?
 	bgez    t0, 1f				# no: skip odd doubles

From d85ebff0073c783f0c74dc0e08c348f6f2d807c7 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:10 +0100
Subject: [PATCH 224/302] MIPS: KVM: Fix pre-r6 ll/sc instructions on r6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The atomic KVM register access macros in kvm_host.h (for the guest Cause
register with KVM in trap & emulate mode) use ll/sc instructions,
however they still .set mips3, which causes pre-MIPSr6 instruction
encodings to be emitted, even for a MIPSr6 build.

Fix it to use MIPS_ISA_ARCH_LEVEL as other parts of arch/mips already
do.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b32785543787bc..b54bcadd8aecab 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -400,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
 	unsigned long temp;
 	do {
 		__asm__ __volatile__(
-		"	.set	mips3				\n"
+		"	.set	"MIPS_ISA_ARCH_LEVEL"		\n"
 		"	" __LL "%0, %1				\n"
 		"	or	%0, %2				\n"
 		"	" __SC	"%0, %1				\n"
@@ -416,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
 	unsigned long temp;
 	do {
 		__asm__ __volatile__(
-		"	.set	mips3				\n"
+		"	.set	"MIPS_ISA_ARCH_LEVEL"		\n"
 		"	" __LL "%0, %1				\n"
 		"	and	%0, %2				\n"
 		"	" __SC	"%0, %1				\n"
@@ -433,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
 	unsigned long temp;
 	do {
 		__asm__ __volatile__(
-		"	.set	mips3				\n"
+		"	.set	"MIPS_ISA_ARCH_LEVEL"		\n"
 		"	" __LL "%0, %1				\n"
 		"	and	%0, %2				\n"
 		"	or	%0, %3				\n"

From 70e92c7ee94094d2db8bfe225a8c9b1bde89c26d Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:11 +0100
Subject: [PATCH 225/302] MIPS: KVM: Don't save/restore lo/hi for r6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MIPSr6 doesn't have lo/hi registers, so don't bother saving or
restoring them, and don't expose them to userland with the KVM ioctl
interface either.

In fact the lo/hi registers aren't callee saved in the MIPS ABIs anyway,
so there is no need to preserve the host lo/hi values at all when
transitioning to and from the guest (which happens via a function call).

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 16 ++++------------
 arch/mips/kvm/mips.c  |  6 ++++++
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index de8b6ec5573f43..75ba7c2ecb3d00 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -178,12 +178,6 @@ void *kvm_mips_build_vcpu_run(void *addr)
 		UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
 	}
 
-	/* Save hi/lo */
-	uasm_i_mflo(&p, V0);
-	UASM_i_SW(&p, V0, offsetof(struct pt_regs, lo), K1);
-	uasm_i_mfhi(&p, V1);
-	UASM_i_SW(&p, V1, offsetof(struct pt_regs, hi), K1);
-
 	/* Save host status */
 	uasm_i_mfc0(&p, V0, C0_STATUS);
 	UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
@@ -307,12 +301,14 @@ static void *kvm_mips_build_enter_guest(void *addr)
 		UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
 	}
 
+#ifndef CONFIG_CPU_MIPSR6
 	/* Restore hi/lo */
 	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
 	uasm_i_mthi(&p, K0);
 
 	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
 	uasm_i_mtlo(&p, K0);
+#endif
 
 	/* Restore the guest's k0/k1 registers */
 	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
@@ -408,12 +404,14 @@ void *kvm_mips_build_exit(void *addr)
 		UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
 	}
 
+#ifndef CONFIG_CPU_MIPSR6
 	/* We need to save hi/lo and restore them on the way out */
 	uasm_i_mfhi(&p, T0);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
 
 	uasm_i_mflo(&p, T0);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+#endif
 
 	/* Finally save guest k1 to VCPU */
 	uasm_i_ehb(&p);
@@ -663,12 +661,6 @@ static void *kvm_mips_build_ret_to_host(void *addr)
 		UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
 	}
 
-	UASM_i_LW(&p, K0, offsetof(struct pt_regs, hi), K1);
-	uasm_i_mthi(&p, K0);
-
-	UASM_i_LW(&p, K0, offsetof(struct pt_regs, lo), K1);
-	uasm_i_mtlo(&p, K0);
-
 	/* Restore RDHWR access */
 	UASM_i_LA_mostly(&p, K0, (long)&hwrena);
 	uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7c76768ff364ed..414b00074e296a 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -521,8 +521,10 @@ static u64 kvm_mips_get_one_regs[] = {
 	KVM_REG_MIPS_R30,
 	KVM_REG_MIPS_R31,
 
+#ifndef CONFIG_CPU_MIPSR6
 	KVM_REG_MIPS_HI,
 	KVM_REG_MIPS_LO,
+#endif
 	KVM_REG_MIPS_PC,
 
 	KVM_REG_MIPS_CP0_INDEX,
@@ -666,12 +668,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
 		v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
 		break;
+#ifndef CONFIG_CPU_MIPSR6
 	case KVM_REG_MIPS_HI:
 		v = (long)vcpu->arch.hi;
 		break;
 	case KVM_REG_MIPS_LO:
 		v = (long)vcpu->arch.lo;
 		break;
+#endif
 	case KVM_REG_MIPS_PC:
 		v = (long)vcpu->arch.pc;
 		break;
@@ -887,12 +891,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31:
 		vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v;
 		break;
+#ifndef CONFIG_CPU_MIPSR6
 	case KVM_REG_MIPS_HI:
 		vcpu->arch.hi = v;
 		break;
 	case KVM_REG_MIPS_LO:
 		vcpu->arch.lo = v;
 		break;
+#endif
 	case KVM_REG_MIPS_PC:
 		vcpu->arch.pc = v;
 		break;

From 2e0badfaac234ef3ed6b5397ff208d218cd450fb Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:12 +0100
Subject: [PATCH 226/302] MIPS: KVM: Support r6 compact branch emulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support in KVM for emulation of instructions in the forbidden slot
of MIPSr6 compact branches. If we hit an exception on the forbidden
slot, then the branch must not have been taken, which makes calculation
of the resume PC trivial.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 52 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 5f0354c80c8eb9..f0fa9e95605696 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		nextpc = epc;
 		break;
 
-	case blez_op:		/* not really i_format */
-	case blezl_op:
-		/* rt field assumed to be zero */
+	case blez_op:	/* POP06 */
+#ifndef CONFIG_CPU_MIPSR6
+	case blezl_op:	/* removed in R6 */
+#endif
+		if (insn.i_format.rt != 0)
+			goto compact_branch;
 		if ((long)arch->gprs[insn.i_format.rs] <= 0)
 			epc = epc + 4 + (insn.i_format.simmediate << 2);
 		else
@@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 		nextpc = epc;
 		break;
 
-	case bgtz_op:
-	case bgtzl_op:
-		/* rt field assumed to be zero */
+	case bgtz_op:	/* POP07 */
+#ifndef CONFIG_CPU_MIPSR6
+	case bgtzl_op:	/* removed in R6 */
+#endif
+		if (insn.i_format.rt != 0)
+			goto compact_branch;
 		if ((long)arch->gprs[insn.i_format.rs] > 0)
 			epc = epc + 4 + (insn.i_format.simmediate << 2);
 		else
@@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
 	case cop1_op:
 		kvm_err("%s: unsupported cop1_op\n", __func__);
 		break;
+
+#ifdef CONFIG_CPU_MIPSR6
+	/* R6 added the following compact branches with forbidden slots */
+	case blezl_op:	/* POP26 */
+	case bgtzl_op:	/* POP27 */
+		/* only rt == 0 isn't compact branch */
+		if (insn.i_format.rt != 0)
+			goto compact_branch;
+		break;
+	case pop10_op:
+	case pop30_op:
+		/* only rs == rt == 0 is reserved, rest are compact branches */
+		if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
+			goto compact_branch;
+		break;
+	case pop66_op:
+	case pop76_op:
+		/* only rs == 0 isn't compact branch */
+		if (insn.i_format.rs != 0)
+			goto compact_branch;
+		break;
+compact_branch:
+		/*
+		 * If we've hit an exception on the forbidden slot, then
+		 * the branch must not have been taken.
+		 */
+		epc += 8;
+		nextpc = epc;
+		break;
+#else
+compact_branch:
+		/* Compact branches not supported before R6 */
+		break;
+#endif
 	}
 
 	return nextpc;

From 5cc4aafced42d7ece3d20650bf6ca2a165e6fca3 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:13 +0100
Subject: [PATCH 227/302] MIPS: KVM: Recognise r6 CACHE encoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recognise the new MIPSr6 CACHE instruction encoding rather than the
pre-r6 one when an r6 kernel is being built. A SPECIAL3 opcode is used
and the immediate field is reduced to 9 bits wide since MIPSr6.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c |  5 ++++-
 arch/mips/kvm/emulate.c  | 21 ++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 8a1833b9eb384d..91ebd2b6034f67 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -72,7 +72,10 @@ int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
 	synci_inst.i_format.opcode = bcond_op;
 	synci_inst.i_format.rs = inst.i_format.rs;
 	synci_inst.i_format.rt = synci_op;
-	synci_inst.i_format.simmediate = inst.i_format.simmediate;
+	if (cpu_has_mips_r6)
+		synci_inst.i_format.simmediate = inst.spec3_format.simmediate;
+	else
+		synci_inst.i_format.simmediate = inst.i_format.simmediate;
 
 	return kvm_mips_trans_replace(vcpu, opc, synci_inst);
 }
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index f0fa9e95605696..62e6a7b313aea5 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1601,7 +1601,10 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 
 	base = inst.i_format.rs;
 	op_inst = inst.i_format.rt;
-	offset = inst.i_format.simmediate;
+	if (cpu_has_mips_r6)
+		offset = inst.spec3_format.simmediate;
+	else
+		offset = inst.i_format.simmediate;
 	cache = op_inst & CacheOp_Cache;
 	op = op_inst & CacheOp_Op;
 
@@ -1764,11 +1767,27 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 		er = kvm_mips_emulate_load(inst, cause, run, vcpu);
 		break;
 
+#ifndef CONFIG_CPU_MIPSR6
 	case cache_op:
 		++vcpu->stat.cache_exits;
 		trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
 		er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
 		break;
+#else
+	case spec3_op:
+		switch (inst.spec3_format.func) {
+		case cache6_op:
+			++vcpu->stat.cache_exits;
+			trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+			er = kvm_mips_emulate_cache(inst, opc, cause, run,
+						    vcpu);
+			break;
+		default:
+			goto unknown;
+		};
+		break;
+unknown:
+#endif
 
 	default:
 		kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,

From 8eeab81c3d55ba41ae68888b13a1a34893104e12 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:14 +0100
Subject: [PATCH 228/302] MIPS: KVM: Decode RDHWR more strictly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When KVM emulates the RDHWR instruction, decode the instruction more
strictly. The rs field (bits 25:21) should be zero, as should bits 10:9.
Bits 8:6 are the register select field in MIPSr6, so we aren't strict
about those bits (no other operations should use that encoding space).

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 62e6a7b313aea5..be18dfe9ecaa21 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -2357,7 +2357,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 	}
 
 	if (inst.r_format.opcode == spec3_op &&
-	    inst.r_format.func == rdhwr_op) {
+	    inst.r_format.func == rdhwr_op &&
+	    inst.r_format.rs == 0 &&
+	    (inst.r_format.re >> 3) == 0) {
 		int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
 		int rd = inst.r_format.rd;
 		int rt = inst.r_format.rt;

From 8426097258c8092f8f3f7a5c420d3809e99b0769 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Mon, 4 Jul 2016 19:35:15 +0100
Subject: [PATCH 229/302] MIPS: KVM: Emulate generic QEMU machine on r6 T&E
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Default the guest PRId register to represent a generic QEMU machine
instead of a 24kc on MIPSr6. 24kc isn't supported by r6 Linux kernels.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/trap_emul.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 00e8dc3d36cb1c..091553942bcbca 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -431,9 +431,15 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	/*
 	 * Arch specific stuff, set up config registers properly so that the
-	 * guest will come up as expected, for now we simulate a MIPS 24kc
+	 * guest will come up as expected
 	 */
+#ifndef CONFIG_CPU_MIPSR6
+	/* r2-r5, simulate a MIPS 24kc */
 	kvm_write_c0_guest_prid(cop0, 0x00019300);
+#else
+	/* r6+, simulate a generic QEMU machine */
+	kvm_write_c0_guest_prid(cop0, 0x00010000);
+#endif
 	/*
 	 * Have config1, Cacheable, noncoherent, write-back, write allocate.
 	 * Endianness, arch revision & virtually tagged icache should match

From 1c17c3e6bfe61846d1120291b2f486d00bc0d18f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Jul 2016 11:53:38 +0200
Subject: [PATCH 230/302] KVM: VMX: reflect broken preemption timer in
 vmcs_config

Simplify cpu_has_vmx_preemption_timer.  This is consistent with the
rest of setup_vmcs_config and preparatory for the next patch.

Tested-by: Wanpeng Li <kernellwp@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e564fa2c7ac881..00ce07e6f2cabd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1121,9 +1121,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void)
 
 static inline bool cpu_has_vmx_preemption_timer(void)
 {
-	if (cpu_has_broken_vmx_preemption_timer())
-		return false;
-
 	return vmcs_config.pin_based_exec_ctrl &
 		PIN_BASED_VMX_PREEMPTION_TIMER;
 }
@@ -3407,6 +3404,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
+	if (cpu_has_broken_vmx_preemption_timer())
+		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 	if (!(_cpu_based_2nd_exec_control &
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
 		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

From 55123e3c862d98dc4fbcade38d158c32c022afd8 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 6 Jul 2016 18:29:58 +0800
Subject: [PATCH 231/302] KVM: nVMX: avoid incorrect preemption timer vmexit in
 nested guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The preemption timer for nested VMX is emulated by hrtimer which is started on L2
entry, stopped on L2 exit and evaluated via the check_nested_events hook. However,
nested_vmx_exit_handled is always returning true for preemption timer vmexit.  Then,
the L1 preemption timer vmexit is captured and treated as an L2 preemption
timer vmexit, causing NULL pointer dereferences or worse in the L1 guest's
vmexit handler:

    BUG: unable to handle kernel NULL pointer dereference at           (null)
    IP: [<          (null)>]           (null)
    PGD 0
    Oops: 0010 [#1] SMP
    Call Trace:
     ? kvm_lapic_expired_hv_timer+0x47/0x90 [kvm]
     handle_preemption_timer+0xe/0x20 [kvm_intel]
     vmx_handle_exit+0x169/0x15a0 [kvm_intel]
     ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm]
     kvm_arch_vcpu_ioctl_run+0xdee/0x19d0 [kvm]
     ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm]
     ? vcpu_load+0x1c/0x60 [kvm]
     ? kvm_arch_vcpu_load+0x57/0x260 [kvm]
     kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm]
     do_vfs_ioctl+0x96/0x6a0
     ? __fget_light+0x2a/0x90
     SyS_ioctl+0x79/0x90
     do_syscall_64+0x68/0x180
     entry_SYSCALL64_slow_path+0x25/0x25
    Code:  Bad RIP value.
    RIP  [<          (null)>]           (null)
     RSP <ffff8800b5263c48>
    CR2: 0000000000000000
    ---[ end trace 9c70c48b1a2bc66e ]---

This can be reproduced readily with the preemption timer enabled on L0 and
disabled on L1.

Return false since preemption timer vmexits must never be reflected to L2.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 00ce07e6f2cabd..0048be79c7b9ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8040,6 +8040,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 	case EXIT_REASON_PCOMMIT:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+	case EXIT_REASON_PREEMPTION_TIMER:
+		return false;
 	default:
 		return true;
 	}

From 9314006db8b781715658cd6a28994d84ccce5dee Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 6 Jul 2016 13:23:51 +0200
Subject: [PATCH 232/302] KVM: nVMX: keep preemption timer enabled during L2
 execution

Because the vmcs12 preemption timer is emulated through a separate hrtimer,
we can keep on using the preemption timer in the vmcs02 to emulate L1's
TSC deadline timer.

However, the corresponding bit in the pin-based execution control field
must be kept consistent between vmcs01 and vmcs02.  On vmentry we copy
it into the vmcs02; on vmexit the preemption timer must be disabled in
the vmcs01 if a preemption timer vmexit happened while in guest mode.

The preemption timer value in the vmcs02 is set by vmx_vcpu_run, so it
need not be considered in prepare_vmcs02.

Cc: Yunhong Jiang <yunhong.jiang@intel.com>
Cc: Haozhong Zhang <haozhong.zhang@intel.com>
Tested-by: Wanpeng Li <kernellwp@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0048be79c7b9ac..8cda4449a60e57 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9796,9 +9796,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
 	exec_control = vmcs12->pin_based_vm_exec_control;
-	exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+	/* Preemption timer setting is only taken from vmcs01.  */
 	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	exec_control |= vmcs_config.pin_based_exec_ctrl;
+	if (vmx->hv_deadline_tsc == -1)
+		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 
+	/* Posted interrupts setting is only taken from vmcs12.  */
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		/*
 		 * Note that we use L0's vector here and in
@@ -10727,8 +10732,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 
-	/* Update TSC_OFFSET if TSC was changed while L2 ran */
+	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+	if (vmx->hv_deadline_tsc == -1)
+		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+				PIN_BASED_VMX_PREEMPTION_TIMER);
+	else
+		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+			      PIN_BASED_VMX_PREEMPTION_TIMER);
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;

From 8391ce447f18476273331399a7f5930e5494bf46 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 7 Jul 2016 14:58:33 +0200
Subject: [PATCH 233/302] KVM: VMX: introduce
 vm_{entry,exit}_control_reset_shadow

There is no reason to read the entry/exit control fields of the
VMCS and immediately write back the same value.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8cda4449a60e57..e515030631815f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1672,6 +1672,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
 	__vmcs_writel(field, __vmcs_readl(field) | mask);
 }
 
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+	vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
 static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
 	vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1700,6 +1705,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
 	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
 }
 
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+	vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
 static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
 {
 	vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -10722,8 +10732,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 				       vmcs12->vm_exit_intr_error_code,
 				       KVM_ISA_VMX);
 
-	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
-	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+	vm_entry_controls_reset_shadow(vmx);
+	vm_exit_controls_reset_shadow(vmx);
 	vmx_segment_cache_clear(vmx);
 
 	/* if no vmcs02 cache requested, remove the one we used */

From 8d5cf1610da526c3c1286bd7b3ac9f35f96ed43d Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 12 Jul 2016 18:18:48 -0400
Subject: [PATCH 234/302] kvm: mmu: extend the is_present check to 32 bits

This is safe because this function is called
on host controlled page tables and non-present/non-MMIO
sptes never use bits 1..31. For the EPT case, this
ensures that cases where only the execute bit is set
are marked valid.

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 837bf23c5b067c..55a20c0524feaf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -304,7 +304,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)

From 812f30b234539ccb0139f92dfdbec1e8158cf535 Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 12 Jul 2016 18:18:50 -0400
Subject: [PATCH 235/302] kvm: mmu: remove is_present_gpte()

We have two versions of the above function.
To prevent confusion and bugs in the future, remove
the non-FNAME version entirely and replace all calls
with the actual check.

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c         | 2 +-
 arch/x86/kvm/mmu.h         | 5 -----
 arch/x86/kvm/paging_tmpl.h | 2 +-
 arch/x86/kvm/x86.c         | 2 +-
 4 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 55a20c0524feaf..6471f8788bd2f3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3189,7 +3189,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		MMU_WARN_ON(VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
 			pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
-			if (!is_present_gpte(pdptr)) {
+			if (!(pdptr & PT_PRESENT_MASK)) {
 				vcpu->arch.mmu.pae_root[i] = 0;
 				continue;
 			}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 66b33b96a31b47..ddc56e91f2e491 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 	return kvm_mmu_load(vcpu);
 }
 
-static inline int is_present_gpte(unsigned long pte)
-{
-	return pte & PT_PRESENT_MASK;
-}
-
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bc019f70e0b6bb..fda5b64ae8f11c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
-	return is_present_gpte(pte);
+	return pte & PT_PRESENT_MASK;
 #else
 	return pte & 7;
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0cc6cf834cdd7f..bb6e8bfaee3b68 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -540,7 +540,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 		goto out;
 	}
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-		if (is_present_gpte(pdpte[i]) &&
+		if ((pdpte[i] & PT_PRESENT_MASK) &&
 		    (pdpte[i] &
 		     vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
 			ret = 0;

From ffb128c89b77b44da18ccf51844a8e750e2c427a Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 12 Jul 2016 18:18:49 -0400
Subject: [PATCH 236/302] kvm: mmu: don't set the present bit unconditionally

To support execute only mappings on behalf of L1
hypervisors, we need to teach set_spte() to honor all three of
L1's XWR bits.  As a start, add a new variable "shadow_present_mask"
that will be set for non-EPT shadow paging and clear for EPT.

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 13 +++++++------
 arch/x86/kvm/vmx.c              |  1 +
 arch/x86/kvm/x86.c              |  4 ++--
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7a628fb6a2c2b0..d0845b289adb39 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1031,7 +1031,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6471f8788bd2f3..b8628e905806e0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -175,6 +175,7 @@ static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_present_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -282,13 +283,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
+	shadow_present_mask = p_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -2245,10 +2247,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 {
 	u64 spte;
 
-	BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
-			VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
-	spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
 	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
 
 	mmu_spte_set(sptep, spte);
@@ -2515,13 +2516,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
-	u64 spte;
+	u64 spte = 0;
 	int ret = 0;
 
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
-	spte = PT_PRESENT_MASK;
+	spte |= shadow_present_mask;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e515030631815f..a75d09d2a799ba 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6473,6 +6473,7 @@ static __init int hardware_setup(void)
 			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
 			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
 			0ull, VMX_EPT_EXECUTABLE_MASK);
+			0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_READABLE_MASK);
 		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bb6e8bfaee3b68..0c1fbb8d9d110a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5878,8 +5878,8 @@ int kvm_arch_init(void *opaque)
 	kvm_x86_ops = ops;
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-			PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+			PT_DIRTY_MASK, PT64_NX_MASK, 0,
+			PT_PRESENT_MASK);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);

From d95c55687e11febe3ab1aacfe82b58b1822c52c4 Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 12 Jul 2016 18:18:51 -0400
Subject: [PATCH 237/302] kvm: mmu: track read permission explicitly for shadow
 EPT page tables

To support execute only mappings on behalf of L1 hypervisors,
reuse ACC_USER_MASK to signify if the L1 hypervisor has the R bit
set.

For the nested EPT case, we assumed that the U bit was always set
since there was no equivalent in EPT page tables.  Strictly
speaking, this was not necessary because handle_ept_violation
never set PFERR_USER_MASK in the error code (uf=0 in the
parlance of update_permission_bitmask).  We now have to set
both U and UF correctly, respectively in FNAME(gpte_access)
and in handle_ept_violation.

Also in handle_ept_violation bit 3 of the exit qualification is
not enough to detect a present PTE; all three bits 3-5 have to
be checked.

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c         | 10 +++++++---
 arch/x86/kvm/paging_tmpl.h |  8 +++++++-
 arch/x86/kvm/vmx.c         | 15 +++++++++------
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b8628e905806e0..3041902ec827d8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2522,6 +2522,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
+	/*
+	 * For the EPT case, shadow_present_mask is 0 if hardware
+	 * supports exec-only page table entries.  In that case,
+	 * ACC_USER_MASK and shadow_user_mask are used to represent
+	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+	 */
 	spte |= shadow_present_mask;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
@@ -3915,9 +3921,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
 				 *   clearer.
 				 */
 				smap = cr4_smap && u && !uf && !ff;
-			} else
-				/* Not really needed: no U/S accesses on ept  */
-				u = 1;
+			}
 
 			fault = (ff && !x) || (uf && !u) || (wf && !w) ||
 				(smapf && smap);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index fda5b64ae8f11c..a01105485315ab 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -181,13 +181,19 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
 	unsigned access;
 #if PTTYPE == PTTYPE_EPT
 	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
 		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
-		ACC_USER_MASK;
+		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
 #else
 	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
 	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a75d09d2a799ba..bd7d60f66b93fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6117,12 +6117,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
 
-	/* It is a write fault? */
-	error_code = exit_qualification & PFERR_WRITE_MASK;
+	/* it is a read fault? */
+	error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+	/* it is a write fault? */
+	error_code |= exit_qualification & PFERR_WRITE_MASK;
 	/* It is a fetch fault? */
 	error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
 	/* ept page table is present? */
-	error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+	error_code |= (exit_qualification & 0x38) != 0;
 
 	vcpu->arch.exit_qualification = exit_qualification;
 
@@ -6469,11 +6471,12 @@ static __init int hardware_setup(void)
 	vmx_disable_intercept_msr_write_x2apic(0x83f);
 
 	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(0ull,
+		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
 			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
 			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-			0ull, VMX_EPT_EXECUTABLE_MASK);
-			0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_READABLE_MASK);
+			0ull, VMX_EPT_EXECUTABLE_MASK,
+			cpu_has_vmx_ept_execute_only() ?
+				      0ull : VMX_EPT_READABLE_MASK);
 		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else

From 02120c45b07953ca4dfc19fa6ff90466efaf363f Mon Sep 17 00:00:00 2001
From: Bandan Das <bsd@redhat.com>
Date: Tue, 12 Jul 2016 18:18:52 -0400
Subject: [PATCH 238/302] kvm: vmx: advertise support for ept execute only

MMU now knows about execute only mappings, so
advertise the feature to L1 hypervisors

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bd7d60f66b93fb..729e5f689097ea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2794,6 +2794,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
 			 VMX_EPT_INVEPT_BIT;
+		if (cpu_has_vmx_ept_execute_only())
+			vmx->nested.nested_vmx_ept_caps |=
+				VMX_EPT_EXECUTE_ONLY_BIT;
 		vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
 		/*
 		 * For nested guests, we don't do anything specific

From 757883de41eca292765578ef87c4f49453529bb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:17 +0200
Subject: [PATCH 239/302] KVM: x86: bump KVM_SOFT_MAX_VCPUS to 240
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

240 has been well tested by Red Hat.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d0845b289adb39..5f90dce6fbd1ae 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,7 +35,7 @@
 #include <asm/kvm_page_track.h>
 
 #define KVM_MAX_VCPUS 255
-#define KVM_SOFT_MAX_VCPUS 160
+#define KVM_SOFT_MAX_VCPUS 240
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3

From 64aa47bfc45323040d5db8f30cbd6851f2606c7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:18 +0200
Subject: [PATCH 240/302] KVM: x86: add kvm_apic_map_get_dest_lapic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_irq_delivery_to_apic_fast and kvm_intr_is_single_vcpu_fast both
compute the interrupt destination.  Factor the code.

'struct kvm_lapic **dst = NULL' had to be added to silence GCC.
GCC might complain about potential NULL access in the future, because it
missed conditions that avoided uninitialized uses of dst.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 230 ++++++++++++++++++-------------------------
 1 file changed, 98 insertions(+), 132 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 22a6474af220e9..2987843657db37 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -671,102 +671,120 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
 	}
 }
 
-bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
-{
-	struct kvm_apic_map *map;
-	unsigned long bitmap = 1;
-	struct kvm_lapic **dst;
-	int i;
-	bool ret, x2apic_ipi;
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped.  In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+		struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+		struct kvm_apic_map *map, struct kvm_lapic ***dst,
+		unsigned long *bitmap)
+{
+	int i, lowest;
+	bool x2apic_ipi;
+	u16 cid;
+
+	if (irq->shorthand == APIC_DEST_SELF && src) {
+		*dst = src;
+		*bitmap = 1;
+		return true;
+	} else if (irq->shorthand)
+		return false;
 
-	*r = -1;
+	x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+		return false;
 
-	if (irq->shorthand == APIC_DEST_SELF) {
-		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+	if (!map)
+		return false;
+
+	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+		if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) {
+			*bitmap = 0;
+		} else {
+			*dst = &map->phys_map[irq->dest_id];
+			*bitmap = 1;
+		}
 		return true;
 	}
 
-	if (irq->shorthand)
+	if (!kvm_apic_logical_map_valid(map))
 		return false;
 
-	x2apic_ipi = src && apic_x2apic_mode(src);
-	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
-		return false;
-
-	ret = true;
-	rcu_read_lock();
-	map = rcu_dereference(kvm->arch.apic_map);
+	apic_logical_id(map, irq->dest_id, &cid, (u16 *)bitmap);
 
-	if (!map) {
-		ret = false;
-		goto out;
+	if (cid >= ARRAY_SIZE(map->logical_map)) {
+		*bitmap = 0;
+		return true;
 	}
 
-	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-			goto out;
+	*dst = map->logical_map[cid];
 
-		dst = &map->phys_map[irq->dest_id];
-	} else {
-		u16 cid;
+	if (!kvm_lowest_prio_delivery(irq))
+		return true;
 
-		if (!kvm_apic_logical_map_valid(map)) {
-			ret = false;
-			goto out;
+	if (!kvm_vector_hashing_enabled()) {
+		lowest = -1;
+		for_each_set_bit(i, bitmap, 16) {
+			if (!(*dst)[i])
+				continue;
+			if (lowest < 0)
+				lowest = i;
+			else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+						(*dst)[lowest]->vcpu) < 0)
+				lowest = i;
 		}
+	} else {
+		if (!*bitmap)
+			return true;
 
-		apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+		lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+				bitmap, 16);
 
-		if (cid >= ARRAY_SIZE(map->logical_map))
-			goto out;
+		if (!(*dst)[lowest]) {
+			kvm_apic_disabled_lapic_found(kvm);
+			*bitmap = 0;
+			return true;
+		}
+	}
 
-		dst = map->logical_map[cid];
+	*bitmap = (lowest >= 0) ? 1 << lowest : 0;
 
-		if (!kvm_lowest_prio_delivery(irq))
-			goto set_irq;
+	return true;
+}
 
-		if (!kvm_vector_hashing_enabled()) {
-			int l = -1;
-			for_each_set_bit(i, &bitmap, 16) {
-				if (!dst[i])
-					continue;
-				if (l < 0)
-					l = i;
-				else if (kvm_apic_compare_prio(dst[i]->vcpu,
-							dst[l]->vcpu) < 0)
-					l = i;
-			}
-			bitmap = (l >= 0) ? 1 << l : 0;
-		} else {
-			int idx;
-			unsigned int dest_vcpus;
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+	struct kvm_apic_map *map;
+	unsigned long bitmap;
+	struct kvm_lapic **dst = NULL;
+	int i;
+	bool ret;
 
-			dest_vcpus = hweight16(bitmap);
-			if (dest_vcpus == 0)
-				goto out;
+	*r = -1;
 
-			idx = kvm_vector_to_index(irq->vector,
-				dest_vcpus, &bitmap, 16);
+	if (irq->shorthand == APIC_DEST_SELF) {
+		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+		return true;
+	}
 
-			if (!dst[idx]) {
-				kvm_apic_disabled_lapic_found(kvm);
-				goto out;
-			}
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
 
-			bitmap = (idx >= 0) ? 1 << idx : 0;
+	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+	if (ret)
+		for_each_set_bit(i, &bitmap, 16) {
+			if (!dst[i])
+				continue;
+			if (*r < 0)
+				*r = 0;
+			*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 		}
-	}
 
-set_irq:
-	for_each_set_bit(i, &bitmap, 16) {
-		if (!dst[i])
-			continue;
-		if (*r < 0)
-			*r = 0;
-		*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
-	}
-out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -789,8 +807,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu)
 {
 	struct kvm_apic_map *map;
+	unsigned long bitmap;
+	struct kvm_lapic **dst = NULL;
 	bool ret = false;
-	struct kvm_lapic *dst = NULL;
 
 	if (irq->shorthand)
 		return false;
@@ -798,69 +817,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 	rcu_read_lock();
 	map = rcu_dereference(kvm->arch.apic_map);
 
-	if (!map)
-		goto out;
-
-	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-		if (irq->dest_id == 0xFF)
-			goto out;
-
-		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
-			goto out;
-
-		dst = map->phys_map[irq->dest_id];
-		if (dst && kvm_apic_present(dst->vcpu))
-			*dest_vcpu = dst->vcpu;
-		else
-			goto out;
-	} else {
-		u16 cid;
-		unsigned long bitmap = 1;
-		int i, r = 0;
-
-		if (!kvm_apic_logical_map_valid(map))
-			goto out;
-
-		apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-
-		if (cid >= ARRAY_SIZE(map->logical_map))
-			goto out;
-
-		if (kvm_vector_hashing_enabled() &&
-				kvm_lowest_prio_delivery(irq)) {
-			int idx;
-			unsigned int dest_vcpus;
+	if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+			hweight16(bitmap) == 1) {
+		unsigned long i = find_first_bit(&bitmap, 16);
 
-			dest_vcpus = hweight16(bitmap);
-			if (dest_vcpus == 0)
-				goto out;
-
-			idx = kvm_vector_to_index(irq->vector, dest_vcpus,
-						  &bitmap, 16);
-
-			dst = map->logical_map[cid][idx];
-			if (!dst) {
-				kvm_apic_disabled_lapic_found(kvm);
-				goto out;
-			}
-
-			*dest_vcpu = dst->vcpu;
-		} else {
-			for_each_set_bit(i, &bitmap, 16) {
-				dst = map->logical_map[cid][i];
-				if (++r == 2)
-					goto out;
-			}
-
-			if (dst && kvm_apic_present(dst->vcpu))
-				*dest_vcpu = dst->vcpu;
-			else
-				goto out;
+		if (dst[i]) {
+			*dest_vcpu = dst[i]->vcpu;
+			ret = true;
 		}
 	}
 
-	ret = true;
-out:
 	rcu_read_unlock();
 	return ret;
 }

From e45115b62f9abb143a03036dbde05faf5864aa01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:19 +0200
Subject: [PATCH 241/302] KVM: x86: use physical LAPIC array for logical x2APIC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Logical x2APIC IDs map injectively to physical x2APIC IDs, so we can
reuse the physical array for them.  This allows us to save space by
sizing the logical maps according to the needs of xAPIC.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++-
 arch/x86/kvm/lapic.c            | 73 +++++++++++++++++----------------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5f90dce6fbd1ae..623089c4e1a723 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -683,8 +683,10 @@ struct kvm_apic_map {
 	struct rcu_head rcu;
 	u8 mode;
 	struct kvm_lapic *phys_map[256];
-	/* first index is cluster id second is cpu id in a cluster */
-	struct kvm_lapic *logical_map[16][16];
+	union {
+		struct kvm_lapic *xapic_flat_map[8];
+		struct kvm_lapic *xapic_cluster_map[16][4];
+	};
 };
 
 /* Hyper-V emulation context */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2987843657db37..9880d03f533dde 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,26 +115,36 @@ static inline int apic_enabled(struct kvm_lapic *apic)
 	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
 	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
-/* The logical map is definitely wrong if we have multiple
- * modes at the same time.  (Physical map is always right.)
- */
-static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
-{
-	return !(map->mode & (map->mode - 1));
-}
-
-static inline void
-apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
-{
-	unsigned lid_bits;
-
-	BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
-	BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
-	BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
-	lid_bits = map->mode;
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+	switch (map->mode) {
+	case KVM_APIC_MODE_X2APIC: {
+		u32 offset = (dest_id >> 16) * 16;
+		u32 max_apic_id = ARRAY_SIZE(map->phys_map) - 1;
+
+		if (offset <= max_apic_id) {
+			u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+			*cluster = &map->phys_map[offset];
+			*mask = dest_id & (0xffff >> (16 - cluster_size));
+		} else {
+			*mask = 0;
+		}
 
-	*cid = dest_id >> lid_bits;
-	*lid = dest_id & ((1 << lid_bits) - 1);
+		return true;
+		}
+	case KVM_APIC_MODE_XAPIC_FLAT:
+		*cluster = map->xapic_flat_map;
+		*mask = dest_id & 0xff;
+		return true;
+	case KVM_APIC_MODE_XAPIC_CLUSTER:
+		*cluster = map->xapic_cluster_map[dest_id >> 4];
+		*mask = dest_id & 0xf;
+		return true;
+	default:
+		/* Not optimized. */
+		return false;
+	}
 }
 
 static void recalculate_apic_map(struct kvm *kvm)
@@ -152,7 +162,8 @@ static void recalculate_apic_map(struct kvm *kvm)
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvm_lapic *apic = vcpu->arch.apic;
-		u16 cid, lid;
+		struct kvm_lapic **cluster;
+		u16 mask;
 		u32 ldr, aid;
 
 		if (!kvm_apic_present(vcpu))
@@ -174,13 +185,11 @@ static void recalculate_apic_map(struct kvm *kvm)
 				new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
 		}
 
-		if (!kvm_apic_logical_map_valid(new))
+		if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
 			continue;
 
-		apic_logical_id(new, ldr, &cid, &lid);
-
-		if (lid && cid < ARRAY_SIZE(new->logical_map))
-			new->logical_map[cid][ffs(lid) - 1] = apic;
+		if (mask)
+			cluster[ffs(mask) - 1] = apic;
 	}
 out:
 	old = rcu_dereference_protected(kvm->arch.apic_map,
@@ -685,7 +694,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
 {
 	int i, lowest;
 	bool x2apic_ipi;
-	u16 cid;
 
 	if (irq->shorthand == APIC_DEST_SELF && src) {
 		*dst = src;
@@ -711,18 +719,11 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
 		return true;
 	}
 
-	if (!kvm_apic_logical_map_valid(map))
+	*bitmap = 0;
+	if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+				(u16 *)bitmap))
 		return false;
 
-	apic_logical_id(map, irq->dest_id, &cid, (u16 *)bitmap);
-
-	if (cid >= ARRAY_SIZE(map->logical_map)) {
-		*bitmap = 0;
-		return true;
-	}
-
-	*dst = map->logical_map[cid];
-
 	if (!kvm_lowest_prio_delivery(irq))
 		return true;
 

From 0ca52e7b81a37260c7edb823c8ac6a49c6280b5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:20 +0200
Subject: [PATCH 242/302] KVM: x86: dynamic kvm_apic_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

x2APIC supports up to 2^32-1 LAPICs, but most guests in coming years will
probably have fewer VCPUs.  Dynamic size saves memory at the cost of
turning one constant into a variable.

apic_map mutex had to be moved before allocation to avoid races with cpu
hotplug.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/lapic.c            | 18 +++++++++++++-----
 arch/x86/kvm/lapic.h            |  2 +-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 623089c4e1a723..a2832cc3cb8150 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -682,11 +682,12 @@ struct kvm_arch_memory_slot {
 struct kvm_apic_map {
 	struct rcu_head rcu;
 	u8 mode;
-	struct kvm_lapic *phys_map[256];
+	u32 max_apic_id;
 	union {
 		struct kvm_lapic *xapic_flat_map[8];
 		struct kvm_lapic *xapic_cluster_map[16][4];
 	};
+	struct kvm_lapic *phys_map[];
 };
 
 /* Hyper-V emulation context */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9880d03f533dde..224fc1c5fcc679 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -120,7 +120,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 	switch (map->mode) {
 	case KVM_APIC_MODE_X2APIC: {
 		u32 offset = (dest_id >> 16) * 16;
-		u32 max_apic_id = ARRAY_SIZE(map->phys_map) - 1;
+		u32 max_apic_id = map->max_apic_id;
 
 		if (offset <= max_apic_id) {
 			u8 cluster_size = min(max_apic_id - offset + 1, 16U);
@@ -152,14 +152,22 @@ static void recalculate_apic_map(struct kvm *kvm)
 	struct kvm_apic_map *new, *old = NULL;
 	struct kvm_vcpu *vcpu;
 	int i;
-
-	new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+	u32 max_id = 255;
 
 	mutex_lock(&kvm->arch.apic_map_lock);
 
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		if (kvm_apic_present(vcpu))
+			max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+
+	new = kzalloc(sizeof(struct kvm_apic_map) +
+	              sizeof(struct kvm_lapic *) * (max_id + 1), GFP_KERNEL);
+
 	if (!new)
 		goto out;
 
+	new->max_apic_id = max_id;
+
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvm_lapic *apic = vcpu->arch.apic;
 		struct kvm_lapic **cluster;
@@ -172,7 +180,7 @@ static void recalculate_apic_map(struct kvm *kvm)
 		aid = kvm_apic_id(apic);
 		ldr = kvm_lapic_get_reg(apic, APIC_LDR);
 
-		if (aid < ARRAY_SIZE(new->phys_map))
+		if (aid <= new->max_apic_id)
 			new->phys_map[aid] = apic;
 
 		if (apic_x2apic_mode(apic)) {
@@ -710,7 +718,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
 		return false;
 
 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-		if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) {
+		if (irq->dest_id > map->max_apic_id) {
 			*bitmap = 0;
 		} else {
 			*dst = &map->phys_map[irq->dest_id];
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 336ba51bb16ecc..8d811139d2b315 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -200,7 +200,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 	return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
 {
 	return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }

From 3159d36ad799a117eb2f898de4c6b7777e4dc045 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:21 +0200
Subject: [PATCH 243/302] KVM: x86: use generic function for MSI parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/irq_comm.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index dfb4c64768771f..47ad681a33fdfd 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -388,21 +388,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
 			       kvm->arch.nr_reserved_ioapic_pins);
 	for (i = 0; i < nr_ioapic_pins; ++i) {
 		hlist_for_each_entry(entry, &table->map[i], link) {
-			u32 dest_id, dest_mode;
-			bool level;
+			struct kvm_lapic_irq irq;
 
 			if (entry->type != KVM_IRQ_ROUTING_MSI)
 				continue;
-			dest_id = (entry->msi.address_lo >> 12) & 0xff;
-			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
-			level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
-			if (level && kvm_apic_match_dest(vcpu, NULL, 0,
-						dest_id, dest_mode)) {
-				u32 vector = entry->msi.data & 0xff;
-
-				__set_bit(vector,
-					  ioapic_handled_vectors);
-			}
+
+			kvm_set_msi_irq(entry, &irq);
+
+			if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+						irq.dest_id, irq.dest_mode))
+				__set_bit(irq.vector, ioapic_handled_vectors);
 		}
 	}
 	srcu_read_unlock(&kvm->irq_srcu, idx);

From a92e2543d6a8653a8ab45cf5df7ef07dafcf3f3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:22 +0200
Subject: [PATCH 244/302] KVM: x86: use hardware-compatible format for APIC ID
 register
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently always shift APIC ID as if APIC was in xAPIC mode.
x2APIC mode wants to use more bits and storing a hardware-compatible
value is the sanest option.

KVM API to set the lapic expects that bottom 8 bits of APIC ID are in
top 8 bits of APIC_ID register, so the register needs to be shifted in
x2APIC mode.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 52 +++++++++++++++++++++++++++++++-------------
 arch/x86/kvm/lapic.h | 12 +++++++---
 arch/x86/kvm/x86.c   | 10 +++++----
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 224fc1c5fcc679..41089dbeeafcb0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -227,7 +227,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 	}
 }
 
-static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
 {
 	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
 	recalculate_apic_map(apic->vcpu->kvm);
@@ -239,11 +239,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
 	recalculate_apic_map(apic->vcpu->kvm);
 }
 
-static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 {
 	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
 
-	kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+	kvm_lapic_set_reg(apic, APIC_ID, id);
 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
 	recalculate_apic_map(apic->vcpu->kvm);
 }
@@ -1102,12 +1102,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		return 0;
 
 	switch (offset) {
-	case APIC_ID:
-		if (apic_x2apic_mode(apic))
-			val = kvm_apic_id(apic);
-		else
-			val = kvm_apic_id(apic) << 24;
-		break;
 	case APIC_ARBPRI:
 		apic_debug("Access APIC ARBPRI register which is for P6\n");
 		break;
@@ -1465,7 +1459,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 	switch (reg) {
 	case APIC_ID:		/* Local APIC ID */
 		if (!apic_x2apic_mode(apic))
-			kvm_apic_set_id(apic, val >> 24);
+			kvm_apic_set_xapic_id(apic, val >> 24);
 		else
 			ret = 1;
 		break;
@@ -1769,7 +1763,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	hrtimer_cancel(&apic->lapic_timer.timer);
 
 	if (!init_event)
-		kvm_apic_set_id(apic, vcpu->vcpu_id);
+		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 	kvm_apic_set_version(apic->vcpu);
 
 	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1990,17 +1984,43 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 	return vector;
 }
 
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-		struct kvm_lapic_state *s)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+		struct kvm_lapic_state *s, bool set)
+{
+	if (apic_x2apic_mode(vcpu->arch.apic)) {
+		u32 *id = (u32 *)(s->regs + APIC_ID);
+
+		if (set)
+			*id >>= 24;
+		else
+			*id <<= 24;
+	}
+
+	return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+	memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+	return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
+	int r;
+
 
 	kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
 	/* set SPIV separately to get count of SW disabled APICs right */
 	apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+	r = kvm_apic_state_fixup(vcpu, s, true);
+	if (r)
+		return r;
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-	/* call kvm_apic_set_id() to put apic into apic_map */
-	kvm_apic_set_id(apic, kvm_apic_id(apic));
+
+	recalculate_apic_map(vcpu->kvm);
 	kvm_apic_set_version(vcpu);
 
 	apic_update_ppr(apic);
@@ -2026,6 +2046,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 		kvm_rtc_eoi_tracking_restore_one(vcpu);
 
 	vcpu->arch.apic_arb_prio = 0;
+
+	return 0;
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 8d811139d2b315..f60d01c29d5107 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -81,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
-		struct kvm_lapic_state *s);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -202,7 +202,13 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 
 static inline u32 kvm_apic_id(struct kvm_lapic *apic)
 {
-	return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+	/* To avoid a race between apic_base and following APIC_ID update when
+	 * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+	 */
+	if (apic_x2apic_mode(apic))
+		return apic->vcpu->vcpu_id;
+
+	return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c1fbb8d9d110a..b6e402d16e0c5f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2779,15 +2779,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.apicv_active)
 		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
-	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
-	return 0;
+	return kvm_apic_get_state(vcpu, s);
 }
 
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	kvm_apic_post_state_restore(vcpu, s);
+	int r;
+
+	r = kvm_apic_set_state(vcpu, s);
+	if (r)
+		return r;
 	update_cr8_intercept(vcpu);
 
 	return 0;

From 49bd29ba1dbd57b029f69cd9afb335a8f564f32f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:23 +0200
Subject: [PATCH 245/302] KVM: x86: reset APIC ID when enabling LAPIC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

APIC ID should be set to the initial APIC ID when enabling LAPIC.
This only matters if the guest changes APIC ID.  No sane OS does that.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 41089dbeeafcb0..0fce77fdbe91e8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1720,9 +1720,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 	/* update jump label if enable bit changes */
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
-		if (value & MSR_IA32_APICBASE_ENABLE)
+		if (value & MSR_IA32_APICBASE_ENABLE) {
+			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 			static_key_slow_dec_deferred(&apic_hw_disabled);
-		else
+		} else
 			static_key_slow_inc(&apic_hw_disabled.key);
 		recalculate_apic_map(vcpu->kvm);
 	}

From c93de59dcd9bef0044e615493ab52d3958243d87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:24 +0200
Subject: [PATCH 246/302] KVM: VMX: optimize APIC ID read with APICv
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The register is in hardware-compatible format now, so there is no need
to intercept it.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 729e5f689097ea..7bdd6b1a2373e3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6461,9 +6461,6 @@ static __init int hardware_setup(void)
 	for (msr = 0x800; msr <= 0x8ff; msr++)
 		vmx_disable_intercept_msr_read_x2apic(msr);
 
-	/* According SDM, in x2apic mode, the whole id reg is used.  But in
-	 * KVM, it only use the highest eight bits. Need to intercept it */
-	vmx_enable_intercept_msr_read_x2apic(0x802);
 	/* TMCCT */
 	vmx_enable_intercept_msr_read_x2apic(0x839);
 	/* TPR */

From 4d8e772bf8e3fcf55fe17e84ce68c20e03041efc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:25 +0200
Subject: [PATCH 247/302] KVM: x86: reset lapic base in kvm_lapic_reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LAPIC is reset in xAPIC mode and the surrounding code expects that.
KVM never resets after initialization.  This patch is just for sanity.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fce77fdbe91e8..3c2a8c1130549d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1763,8 +1763,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	/* Stop the timer in case it's a reset to an active apic */
 	hrtimer_cancel(&apic->lapic_timer.timer);
 
-	if (!init_event)
+	if (!init_event) {
+		kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+		                         MSR_IA32_APICBASE_ENABLE);
 		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+	}
 	kvm_apic_set_version(apic->vcpu);
 
 	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1903,9 +1906,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	 * thinking that APIC satet has changed.
 	 */
 	vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-	kvm_lapic_set_base(vcpu,
-			APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
-
 	static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
 	kvm_lapic_reset(vcpu, false);
 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);

From c63cf538eb4bf6a5ffd3750366d8d56f023645a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:26 +0200
Subject: [PATCH 248/302] KVM: pass struct kvm to kvm_set_routing_entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Arch-specific code will use it.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/mpic.c   | 3 ++-
 arch/s390/kvm/interrupt.c | 3 ++-
 arch/x86/kvm/irq_comm.c   | 3 ++-
 include/linux/kvm_host.h  | 3 ++-
 virt/kvm/irqchip.c        | 7 ++++---
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 6249cdc834d149..ed38f811411847 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	return 0;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
 {
 	int r = -EINVAL;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ca19627779db09..24524c0f3ef88b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2246,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
 	return ret;
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
 {
 	int ret;
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 47ad681a33fdfd..889563d50c55a5 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -248,7 +248,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
 }
 
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
 {
 	int r = -EINVAL;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 66b2f6159aadd3..60d339faa94c4d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1011,7 +1011,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *entries,
 			unsigned nr,
 			unsigned flags);
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue);
 void kvm_free_irq_routing(struct kvm *kvm);
 
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 8db197bb6c7a92..df99e9c3b64d31 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
 	free_irq_routing_table(rt);
 }
 
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+static int setup_routing_entry(struct kvm *kvm,
+			       struct kvm_irq_routing_table *rt,
 			       struct kvm_kernel_irq_routing_entry *e,
 			       const struct kvm_irq_routing_entry *ue)
 {
@@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 
 	e->gsi = ue->gsi;
 	e->type = ue->type;
-	r = kvm_set_routing_entry(e, ue);
+	r = kvm_set_routing_entry(kvm, e, ue);
 	if (r)
 		goto out;
 	if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
@@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			kfree(e);
 			goto out;
 		}
-		r = setup_routing_entry(new, e, ue);
+		r = setup_routing_entry(kvm, new, e, ue);
 		if (r) {
 			kfree(e);
 			goto out;

From 3713131345fbea291cbd859d248e06ed77815962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:27 +0200
Subject: [PATCH 249/302] KVM: x86: add KVM_CAP_X2APIC_API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM_CAP_X2APIC_API is a capability for features related to x2APIC
enablement.  The KVM_X2APIC_API_USE_32BIT_IDS feature can be enabled to
extend APIC ID in get/set ioctl and MSI addresses to 32 bits.
Both are needed to support x2APIC.

The feature has to be enableable and disabled by default, because
get/set ioctl shifted and truncated APIC ID to 8 bits by using a
non-standard protocol inspired by xAPIC and the change is not
backward-compatible.

Changes to MSI addresses follow the format used by interrupt remapping
unit.  The upper address word, that used to be 0, contains upper 24 bits
of the LAPIC address in its upper 24 bits.  Lower 8 bits are reserved as
0.  Using the upper address word is not backward-compatible either as we
didn't check that userspace zeroed the word.  Reserved bits are still
not explicitly checked, but non-zero data will affect LAPIC addresses,
which will cause a bug.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 41 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  4 ++-
 arch/x86/kvm/irq_comm.c           | 29 ++++++++++++++++++----
 arch/x86/kvm/lapic.c              | 13 +++++++---
 arch/x86/kvm/vmx.c                |  2 +-
 arch/x86/kvm/x86.c                | 15 +++++++++++
 include/trace/events/kvm.h        |  5 ++--
 include/uapi/linux/kvm.h          |  3 +++
 8 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 09efa9eb3926d5..e34e51fa28b06a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
+feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
+address_hi bits 31-8 provide bits 31-8 of the destination id.  Bits 7-0 of
+address_hi must be zero.
+
 struct kvm_irq_routing_s390_adapter {
 	__u64 ind_addr;
 	__u64 summary_addr;
@@ -1583,6 +1588,17 @@ struct kvm_lapic_state {
 Reads the Local APIC registers and copies them into the input argument.  The
 data format and layout are the same as documented in the architecture manual.
 
+If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is
+enabled, then the format of APIC_ID register depends on the APIC mode
+(reported by MSR_IA32_APICBASE) of its VCPU.  x2APIC stores APIC ID in
+the APIC_ID register (bytes 32-35).  xAPIC only allows an 8-bit APIC ID
+which is stored in bits 31-24 of the APIC register, or equivalently in
+byte 35 of struct kvm_lapic_state's regs field.  KVM_GET_LAPIC must then
+be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR.
+
+If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state
+always uses xAPIC format.
+
 
 4.58 KVM_SET_LAPIC
 
@@ -1600,6 +1616,10 @@ struct kvm_lapic_state {
 Copies the input argument into the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+
 
 4.59 KVM_IOEVENTFD
 
@@ -2180,6 +2200,10 @@ struct kvm_msi {
 
 No flags are defined so far. The corresponding field must be 0.
 
+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled.  If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id.  Bits 7-0 of address_hi must be zero.
+
 
 4.71 KVM_CREATE_PIT2
 
@@ -3811,6 +3835,23 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
 Will return -EINVAL if the machine does not support runtime-instrumentation.
 Will return -EBUSY if a VCPU has already been created.
 
+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: args[0] - features that should be enabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid features
+
+Valid feature flags in args[0] are
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+
+Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
+KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
+allowing the use of 32-bit APIC IDs.  See KVM_CAP_X2APIC_API in their
+respective sections.
+
+
+
 8. Other capabilities.
 ----------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a2832cc3cb8150..7c00ba3242d71b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -782,6 +782,8 @@ struct kvm_arch {
 	u32 ldr_mode;
 	struct page *avic_logical_id_table_page;
 	struct page *avic_physical_id_table_page;
+
+	bool x2apic_format;
 };
 
 struct kvm_vm_stat {
@@ -1364,7 +1366,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			     struct kvm_vcpu **dest_vcpu);
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq);
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 889563d50c55a5..25810b144b58d9 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 }
 
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 		     struct kvm_lapic_irq *irq)
 {
-	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+	trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+	                                     (u64)e->msi.address_hi << 32 : 0),
+	                      e->msi.data);
 
 	irq->dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+	if (kvm->arch.x2apic_format)
+		irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
 	irq->vector = (e->msi.data &
 			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
 	irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
 
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+		struct kvm_kernel_irq_routing_entry *e)
+{
+	return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		struct kvm *kvm, int irq_source_id, int level, bool line_status)
 {
 	struct kvm_lapic_irq irq;
 
+	if (kvm_msi_route_invalid(kvm, e))
+		return -EINVAL;
+
 	if (!level)
 		return -1;
 
-	kvm_set_msi_irq(e, &irq);
+	kvm_set_msi_irq(kvm, e, &irq);
 
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
 }
@@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 	if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
 		return -EWOULDBLOCK;
 
-	kvm_set_msi_irq(e, &irq);
+	if (kvm_msi_route_invalid(kvm, e))
+		return -EINVAL;
+
+	kvm_set_msi_irq(kvm, e, &irq);
 
 	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
 		return r;
@@ -286,6 +302,9 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.address_lo = ue->u.msi.address_lo;
 		e->msi.address_hi = ue->u.msi.address_hi;
 		e->msi.data = ue->u.msi.data;
+
+		if (kvm_msi_route_invalid(kvm, e))
+			goto out;
 		break;
 	case KVM_IRQ_ROUTING_HV_SINT:
 		e->set = kvm_hv_set_sint;
@@ -394,7 +413,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
 			if (entry->type != KVM_IRQ_ROUTING_MSI)
 				continue;
 
-			kvm_set_msi_irq(entry, &irq);
+			kvm_set_msi_irq(vcpu->kvm, entry, &irq);
 
 			if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
 						irq.dest_id, irq.dest_mode))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3c2a8c1130549d..d27a7829a4cedc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1991,10 +1991,15 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
 	if (apic_x2apic_mode(vcpu->arch.apic)) {
 		u32 *id = (u32 *)(s->regs + APIC_ID);
 
-		if (set)
-			*id >>= 24;
-		else
-			*id <<= 24;
+		if (vcpu->kvm->arch.x2apic_format) {
+			if (*id != vcpu->vcpu_id)
+				return -EINVAL;
+		} else {
+			if (set)
+				*id >>= 24;
+			else
+				*id <<= 24;
+		}
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bdd6b1a2373e3..b61cdadf8623d6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11104,7 +11104,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		 * We will support full lowest-priority interrupt later.
 		 */
 
-		kvm_set_msi_irq(e, &irq);
+		kvm_set_msi_irq(kvm, e, &irq);
 		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
 			/*
 			 * Make sure the IRTE is in remapped mode if
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6e402d16e0c5f..d86f563a689601 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,6 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS)
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
 static void enter_smm(struct kvm_vcpu *vcpu);
@@ -2625,6 +2627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
+	case KVM_CAP_X2APIC_API:
+		r = KVM_X2APIC_API_VALID_FLAGS;
+		break;
 	default:
 		r = 0;
 		break;
@@ -3799,6 +3804,16 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		mutex_unlock(&kvm->lock);
 		break;
 	}
+	case KVM_CAP_X2APIC_API:
+		r = -EINVAL;
+		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+			break;
+
+		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+			kvm->arch.x2apic_format = true;
+
+		r = 0;
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index f28292d73ddba7..8ade3eb6c640e0 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
 		__entry->data		= data;
 	),
 
-	TP_printk("dst %u vec %u (%s|%s|%s%s)",
-		  (u8)(__entry->address >> 12), (u8)__entry->data,
+	TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+		  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+		  (u8)__entry->data,
 		  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
 		  (__entry->address & (1<<2)) ? "logical" : "physical",
 		  (__entry->data & (1<<15)) ? "level" : "edge",
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf475104ca1..f704403e19a035 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1313,4 +1314,6 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+
 #endif /* __LINUX_KVM_H */

From c519265f2aa348b2f1b9ecf8fbe20bb7c0fb102e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:28 +0200
Subject: [PATCH 250/302] KVM: x86: add a flag to disable KVM x2apic broadcast
 quirk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK as a feature flag to
KVM_CAP_X2APIC_API.

The quirk made KVM interpret 0xff as a broadcast even in x2APIC mode.
The enableable capability is needed in order to support standard x2APIC and
remain backward compatible.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
[Expand kvm_apic_mda comment. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  6 ++++
 arch/x86/include/asm/kvm_host.h   |  1 +
 arch/x86/kvm/lapic.c              | 53 +++++++++++++++++++++++--------
 arch/x86/kvm/x86.c                |  5 ++-
 include/uapi/linux/kvm.h          |  1 +
 5 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e34e51fa28b06a..c4d2fb0e28dedd 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3844,12 +3844,18 @@ Returns: 0 on success, -EINVAL when args[0] contains invalid features
 Valid feature flags in args[0] are
 
 #define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
 
 Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
 KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
 allowing the use of 32-bit APIC IDs.  See KVM_CAP_X2APIC_API in their
 respective sections.
 
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work
+in logical mode or with more than 255 VCPUs.  Otherwise, KVM treats 0xff
+as a broadcast even in x2APIC mode in order to support physical x2APIC
+without interrupt remapping.  This is undesirable in logical mode,
+where 0xff represents CPUs 0-7 in cluster 0.
 
 
 8. Other capabilities.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7c00ba3242d71b..074b5c760327a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -784,6 +784,7 @@ struct kvm_arch {
 	struct page *avic_physical_id_table_page;
 
 	bool x2apic_format;
+	bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d27a7829a4cedc..a16e0bb95d280c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -616,17 +616,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 	}
 }
 
-/* KVM APIC implementation has two quirks
- *  - dest always begins at 0 while xAPIC MDA has offset 24,
- *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+/* The KVM local APIC implementation has two quirks:
+ *
+ *  - the xAPIC MDA stores the destination at bits 24-31, while this
+ *    is not true of struct kvm_lapic_irq's dest_id field.  This is
+ *    just a quirk in the API and is not problematic.
+ *
+ *  - in-kernel IOAPIC messages have to be delivered directly to
+ *    x2APIC, because the kernel does not support interrupt remapping.
+ *    In order to support broadcast without interrupt remapping, x2APIC
+ *    rewrites the destination of non-IPI messages from APIC_BROADCAST
+ *    to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API.  This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
  */
-static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
-                                              struct kvm_lapic *target)
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+		struct kvm_lapic *source, struct kvm_lapic *target)
 {
 	bool ipi = source != NULL;
 	bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
 
-	if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+	if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+	    !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
 		return X2APIC_BROADCAST;
 
 	return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
@@ -636,7 +649,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, unsigned int dest, int dest_mode)
 {
 	struct kvm_lapic *target = vcpu->arch.apic;
-	u32 mda = kvm_apic_mda(dest, source, target);
+	u32 mda = kvm_apic_mda(vcpu, dest, source, target);
 
 	apic_debug("target %p, source %p, dest 0x%x, "
 		   "dest_mode 0x%x, short_hand 0x%x\n",
@@ -688,6 +701,25 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
 	}
 }
 
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+		struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+	if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+		if ((irq->dest_id == APIC_BROADCAST &&
+				map->mode != KVM_APIC_MODE_X2APIC))
+			return true;
+		if (irq->dest_id == X2APIC_BROADCAST)
+			return true;
+	} else {
+		bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+		if (irq->dest_id == (x2apic_ipi ?
+		                     X2APIC_BROADCAST : APIC_BROADCAST))
+			return true;
+	}
+
+	return false;
+}
+
 /* Return true if the interrupt can be handled by using *bitmap as index mask
  * for valid destinations in *dst array.
  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
@@ -701,7 +733,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
 		unsigned long *bitmap)
 {
 	int i, lowest;
-	bool x2apic_ipi;
 
 	if (irq->shorthand == APIC_DEST_SELF && src) {
 		*dst = src;
@@ -710,11 +741,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
 	} else if (irq->shorthand)
 		return false;
 
-	x2apic_ipi = src && *src && apic_x2apic_mode(*src);
-	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
-		return false;
-
-	if (!map)
+	if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
 		return false;
 
 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d86f563a689601..f0d23622bc4e36 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,7 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
-#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS)
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
@@ -3811,6 +3812,8 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 
 		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
 			kvm->arch.x2apic_format = true;
+		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+			kvm->arch.x2apic_broadcast_quirk_disabled = true;
 
 		r = 0;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f704403e19a035..4f8030e5b05d22 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1315,5 +1315,6 @@ struct kvm_assigned_msix_entry {
 };
 
 #define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
 
 #endif /* __LINUX_KVM_H */

From 682f732ecf7396e9d6fe24d44738966699fae6c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:29 +0200
Subject: [PATCH 251/302] KVM: x86: bump MAX_VCPUS to 288
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

288 is in high demand because of Knights Landing CPU.
We cannot set the limit to 640k, because that would be wasting space.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 074b5c760327a3..21a40dc7aad6f7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,7 +34,7 @@
 #include <asm/asm.h>
 #include <asm/kvm_page_track.h>
 
-#define KVM_MAX_VCPUS 255
+#define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */

From af1bae5497b98cb99d6b0492e6981f060420a00c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Tue, 12 Jul 2016 22:09:30 +0200
Subject: [PATCH 252/302] KVM: x86: bump KVM_MAX_VCPU_ID to 1023
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kzalloc was replaced with kvm_kvzalloc to allow non-contiguous areas and
rcu had to be modified to cope with it.

The practical limit for KVM_MAX_VCPU_ID right now is INT_MAX, but a lower
value was chosen in case there are bugs.  1023 is a sufficient maximum
APIC ID for 288 VCPUs.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/lapic.c            | 13 ++++++++++---
 arch/x86/kvm/x86.c              |  2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 21a40dc7aad6f7..9fcb197aa5cee0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
 
 #define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a16e0bb95d280c..6895fd28aae97c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -147,6 +147,13 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 	}
 }
 
+static void kvm_apic_map_free(struct rcu_head *rcu)
+{
+	struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
+
+	kvfree(map);
+}
+
 static void recalculate_apic_map(struct kvm *kvm)
 {
 	struct kvm_apic_map *new, *old = NULL;
@@ -160,8 +167,8 @@ static void recalculate_apic_map(struct kvm *kvm)
 		if (kvm_apic_present(vcpu))
 			max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
 
-	new = kzalloc(sizeof(struct kvm_apic_map) +
-	              sizeof(struct kvm_lapic *) * (max_id + 1), GFP_KERNEL);
+	new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
 
 	if (!new)
 		goto out;
@@ -206,7 +213,7 @@ static void recalculate_apic_map(struct kvm *kvm)
 	mutex_unlock(&kvm->arch.apic_map_lock);
 
 	if (old)
-		kfree_rcu(old, rcu);
+		call_rcu(&old->rcu, kvm_apic_map_free);
 
 	kvm_make_scan_ioapic_request(kvm);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0d23622bc4e36..a27b33033700aa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7927,7 +7927,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
-	kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kvm_mmu_uninit_vm(kvm);
 }
 

From 40c4f8d27296428c894551c9e30d8016a2551116 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 14 Jul 2016 13:19:34 +0300
Subject: [PATCH 253/302] arm64: KVM: Clean up a condition

My static checker complains that this condition looks like it should be
== instead of =.  This isn't a fast path, so we don't need to be fancy.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm64/kvm/sys_regs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a57d650f552cd6..b0b225ceca18f9 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
 				struct sys_reg_params *params)
 {
 	u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
-	int cp;
+	int cp = -1;
 
 	switch(hsr_ec) {
 	case ESR_ELx_EC_CP15_32:
@@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
 		cp = 14;
 		break;
 	default:
-		WARN_ON((cp = -1));
+		WARN_ON(1);
 	}
 
 	kvm_err("Unsupported guest CP%d access at: %08lx\n",

From 6502a34cfd6695929086187f63fe670cc3050e68 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Tue, 21 Jun 2016 14:19:51 +0200
Subject: [PATCH 254/302] KVM: s390: allow user space to handle instr 0x0000

We will use illegal instruction 0x0000 for handling 2 byte sw breakpoints
from user space. As it can be enabled dynamically via a capability,
let's move setting of ICTL_OPEREXC to the post creation step, so we avoid
any races when enabling that capability just while adding new cpus.

Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt | 13 +++++++++++++
 arch/s390/include/asm/kvm_host.h  |  2 ++
 arch/s390/kvm/intercept.c         |  3 +++
 arch/s390/kvm/kvm-s390.c          | 26 ++++++++++++++++++++++++--
 include/uapi/linux/kvm.h          |  1 +
 5 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index c4d2fb0e28dedd..299306db5d8413 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3857,6 +3857,19 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC
 without interrupt remapping.  This is undesirable in logical mode,
 where 0xff represents CPUs 0-7 in cluster 0.
 
+7.8 KVM_CAP_S390_USER_INSTR0
+
+Architectures: s390
+Parameters: none
+
+With this capability enabled, all illegal instructions 0x0000 (2 bytes) will
+be intercepted and forwarded to user space. User space can use this
+mechanism e.g. to realize 2-byte software breakpoints. The kernel will
+not inject an operation exception for these instructions; user space has
+to take care of that.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
 
 8. Other capabilities.
 ----------------------
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 946fc86202fdac..183b01727de4e1 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
 /* s390-specific vcpu->requests bit members */
 #define KVM_REQ_ENABLE_IBS         8
 #define KVM_REQ_DISABLE_IBS        9
+#define KVM_REQ_ICPT_OPEREXC       10
 
 #define SIGP_CTRL_C		0x80
 #define SIGP_CTRL_SCN_MASK	0x3f
@@ -666,6 +667,7 @@ struct kvm_arch{
 	int user_cpu_state_ctrl;
 	int user_sigp;
 	int user_stsi;
+	int user_instr0;
 	struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
 	wait_queue_head_t ipte_wq;
 	int ipte_lock_count;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 850be47c4cc93f..7a2f1551bc3954 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -359,6 +359,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 	    test_kvm_facility(vcpu->kvm, 74))
 		return handle_sthyi(vcpu);
 
+	if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+		return -EOPNOTSUPP;
+
 	return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d42428c1179412..63ac7c1641a7b1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -364,6 +364,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_USER_STSI:
 	case KVM_CAP_S390_SKEYS:
 	case KVM_CAP_S390_IRQ_STATE:
+	case KVM_CAP_S390_USER_INSTR0:
 		r = 1;
 		break;
 	case KVM_CAP_S390_MEM_OP:
@@ -456,6 +457,16 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	return r;
 }
 
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+	unsigned int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+	}
+}
+
 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 {
 	int r;
@@ -507,6 +518,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		kvm->arch.user_stsi = 1;
 		r = 0;
 		break;
+	case KVM_CAP_S390_USER_INSTR0:
+		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+		kvm->arch.user_instr0 = 1;
+		icpt_operexc_on_all_vcpus(kvm);
+		r = 0;
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1836,6 +1853,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 		vcpu->arch.gmap = vcpu->kvm->arch.gmap;
 		sca_add_vcpu(vcpu);
 	}
+	if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
 	/* make vcpu_load load the right gmap on the first trigger */
 	vcpu->arch.enabled_gmap = vcpu->arch.gmap;
 }
@@ -1923,8 +1942,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
 	vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
-	if (test_kvm_facility(vcpu->kvm, 74))
-		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
 
 	if (vcpu->kvm->arch.use_cmma) {
 		rc = kvm_s390_vcpu_setup_cmma(vcpu);
@@ -2369,6 +2386,11 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 		goto retry;
 	}
 
+	if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+		vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+		goto retry;
+	}
+
 	/* nothing to do, just clear the request */
 	clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4f8030e5b05d22..70941f4ab6d800 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -867,6 +867,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
 #define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
 
 #ifdef KVM_CAP_IRQ_ROUTING
 

From 9acc317b183fdd3ed3bca218271875c0e808daae Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 18 Jul 2016 09:18:13 +0200
Subject: [PATCH 255/302] KVM: s390: let ptff intercepts result in cc=3

We don't emulate ptff subfunctions, therefore react to any attempt at
execution by setting cc=3 (Requested function not available).

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/priv.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index c77ad2dc334ff7..46160388e9964b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1185,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+	/* we don't emulate any control instructions yet */
+	kvm_s390_set_psw_cc(vcpu, 3);
+	return 0;
+}
+
 static const intercept_handler_t x01_handlers[256] = {
+	[0x04] = handle_ptff,
 	[0x07] = handle_sckpf,
 };
 

From 8f6cdc1c2eec20c3bbf3a83ad0e1db165f709917 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:22 +0100
Subject: [PATCH 256/302] KVM: arm/arm64: vgic: Move redistributor
 kvm_io_devices

Logically a GICv3 redistributor is assigned to a (v)CPU, so we should
aim to keep redistributor related variables out of our struct vgic_dist.

Let's start by replacing the redistributor related kvm_io_device array
with two members in our existing struct vgic_cpu, which are naturally
per-VCPU and thus don't require any allocation / freeing.
So apart from the better fit with the redistributor design this saves
some code as well.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h           |  8 +++++++-
 virt/kvm/arm/vgic/vgic-init.c    |  1 -
 virt/kvm/arm/vgic/vgic-mmio-v3.c | 22 ++++++++--------------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 12640378db9899..5142e2ab9f5e1d 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -145,7 +145,6 @@ struct vgic_dist {
 	struct vgic_irq		*spis;
 
 	struct vgic_io_device	dist_iodev;
-	struct vgic_io_device	*redist_iodevs;
 };
 
 struct vgic_v2_cpu_if {
@@ -193,6 +192,13 @@ struct vgic_cpu {
 	struct list_head ap_list_head;
 
 	u64 live_lrs;
+
+	/*
+	 * Members below are used with GICv3 emulation only and represent
+	 * parts of the redistributor.
+	 */
+	struct vgic_io_device	rd_iodev;
+	struct vgic_io_device	sgi_iodev;
 };
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index a1442f7c9c4d3e..90cae489c34cdd 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -271,7 +271,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
 	dist->initialized = false;
 
 	kfree(dist->spis);
-	kfree(dist->redist_iodevs);
 	dist->nr_spis = 0;
 
 	mutex_unlock(&kvm->lock);
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index a0c515a412a7c4..fc7b6c97acbb25 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -285,21 +285,14 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
 
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 {
-	int nr_vcpus = atomic_read(&kvm->online_vcpus);
 	struct kvm_vcpu *vcpu;
-	struct vgic_io_device *devices;
 	int c, ret = 0;
 
-	devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
-			  GFP_KERNEL);
-	if (!devices)
-		return -ENOMEM;
-
 	kvm_for_each_vcpu(c, vcpu, kvm) {
 		gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
 		gpa_t sgi_base = rd_base + SZ_64K;
-		struct vgic_io_device *rd_dev = &devices[c * 2];
-		struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+		struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+		struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
 
 		kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
 		rd_dev->base_addr = rd_base;
@@ -335,14 +328,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 	if (ret) {
 		/* The current c failed, so we start with the previous one. */
 		for (c--; c >= 0; c--) {
+			struct vgic_cpu *vgic_cpu;
+
+			vcpu = kvm_get_vcpu(kvm, c);
+			vgic_cpu = &vcpu->arch.vgic_cpu;
 			kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-						  &devices[c * 2].dev);
+						  &vgic_cpu->rd_iodev.dev);
 			kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-						  &devices[c * 2 + 1].dev);
+						  &vgic_cpu->sgi_iodev.dev);
 		}
-		kfree(devices);
-	} else {
-		kvm->arch.vgic.redist_iodevs = devices;
 	}
 
 	return ret;

From 42c8870f90098796c2ed7c9eaa3e7526407502a8 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:23 +0100
Subject: [PATCH 257/302] KVM: arm/arm64: vgic: Check return value for
 kvm_register_vgic_device

kvm_register_device_ops() can return an error, so let's check its return
value and propagate this up the call chain.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-kvm-device.c | 15 +++++++++------
 virt/kvm/arm/vgic/vgic-v2.c         | 11 ++++++++---
 virt/kvm/arm/vgic/vgic-v3.c         | 15 +++++++++++++--
 virt/kvm/arm/vgic/vgic.h            |  2 +-
 4 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 0130c4b147b7db..2f24f13c6c9040 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -210,20 +210,24 @@ static void vgic_destroy(struct kvm_device *dev)
 	kfree(dev);
 }
 
-void kvm_register_vgic_device(unsigned long type)
+int kvm_register_vgic_device(unsigned long type)
 {
+	int ret = -ENODEV;
+
 	switch (type) {
 	case KVM_DEV_TYPE_ARM_VGIC_V2:
-		kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-					KVM_DEV_TYPE_ARM_VGIC_V2);
+		ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+					      KVM_DEV_TYPE_ARM_VGIC_V2);
 		break;
 #ifdef CONFIG_KVM_ARM_VGIC_V3
 	case KVM_DEV_TYPE_ARM_VGIC_V3:
-		kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-					KVM_DEV_TYPE_ARM_VGIC_V3);
+		ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+					      KVM_DEV_TYPE_ARM_VGIC_V3);
 		break;
 #endif
 	}
+
+	return ret;
 }
 
 /** vgic_attr_regs_access: allows user space to read/write VGIC registers
@@ -428,4 +432,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
 };
 
 #endif /* CONFIG_KVM_ARM_VGIC_V3 */
-
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index e31405ee5515b9..079bf670c4512e 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -332,20 +332,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
 	vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
 	kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
 
+	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+	if (ret) {
+		kvm_err("Cannot register GICv2 KVM device\n");
+		iounmap(kvm_vgic_global_state.vctrl_base);
+		return ret;
+	}
+
 	ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
 				     kvm_vgic_global_state.vctrl_base +
 					 resource_size(&info->vctrl),
 				     info->vctrl.start);
-
 	if (ret) {
 		kvm_err("Cannot map VCTRL into hyp\n");
+		kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
 		iounmap(kvm_vgic_global_state.vctrl_base);
 		return ret;
 	}
 
 	kvm_vgic_global_state.can_emulate_gicv2 = true;
-	kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-
 	kvm_vgic_global_state.vcpu_base = info->vcpu.start;
 	kvm_vgic_global_state.type = VGIC_V2;
 	kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 346b4ad12b497f..e48a22e9ee4047 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -296,6 +296,7 @@ int vgic_v3_map_resources(struct kvm *kvm)
 int vgic_v3_probe(const struct gic_kvm_info *info)
 {
 	u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+	int ret;
 
 	/*
 	 * The ListRegs field is 5 bits, but there is a architectural
@@ -319,12 +320,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
 	} else {
 		kvm_vgic_global_state.vcpu_base = info->vcpu.start;
 		kvm_vgic_global_state.can_emulate_gicv2 = true;
-		kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+		ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+		if (ret) {
+			kvm_err("Cannot register GICv2 KVM device.\n");
+			return ret;
+		}
 		kvm_info("vgic-v2@%llx\n", info->vcpu.start);
 	}
+	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+	if (ret) {
+		kvm_err("Cannot register GICv3 KVM device.\n");
+		kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+		return ret;
+	}
+
 	if (kvm_vgic_global_state.vcpu_base == 0)
 		kvm_info("disabling GICv2 emulation\n");
-	kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 
 	kvm_vgic_global_state.vctrl_base = NULL;
 	kvm_vgic_global_state.type = VGIC_V3;
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 7b300ca370b7bd..c752152e82485e 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -124,7 +124,7 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
 }
 #endif
 
-void kvm_register_vgic_device(unsigned long type);
+int kvm_register_vgic_device(unsigned long type);
 int vgic_lazy_init(struct kvm *kvm);
 int vgic_init(struct kvm *kvm);
 

From 2b8ddd9337ee0d001b22507f95596648a1a90992 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:24 +0100
Subject: [PATCH 258/302] KVM: Extend struct kvm_msi to hold a 32-bit device ID

The ARM GICv3 ITS MSI controller requires a device ID to be able to
assign the proper interrupt vector. On real hardware, this ID is
sampled from the bus. To be able to emulate an ITS controller, extend
the KVM MSI interface to let userspace provide such a device ID. For
PCI devices, the device ID is simply the 16-bit bus-device-function
triplet, which should be easily available to the userland tool.

Also there is a new KVM capability which advertises whether the
current VM requires a device ID to be set along with the MSI data.
This flag is still reported as not available everywhere; later we will
enable it when ITS emulation is used.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Eric Auger <eric.auger@linaro.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt | 12 ++++++++++--
 include/uapi/linux/kvm.h          |  5 ++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 09efa9eb3926d5..65513119fee87d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2175,10 +2175,18 @@ struct kvm_msi {
 	__u32 address_hi;
 	__u32 data;
 	__u32 flags;
-	__u8  pad[16];
+	__u32 devid;
+	__u8  pad[12];
 };
 
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid contains a valid value
+devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier
+       for the device that wrote the MSI message.
+       For PCI, this is usually a BFD identifier in the lower 16 bits.
+
+The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide
+the device ID. If this capability is not set, userland cannot rely on
+the kernel to allow the KVM_MSI_VALID_DEVID flag being set.
 
 
 4.71 KVM_CREATE_PIT2
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf475104ca1..7de96f5bb92c35 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_MSI_DEVID 129
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1024,12 +1025,14 @@ struct kvm_one_reg {
 	__u64 addr;
 };
 
+#define KVM_MSI_VALID_DEVID	(1U << 0)
 struct kvm_msi {
 	__u32 address_lo;
 	__u32 address_hi;
 	__u32 data;
 	__u32 flags;
-	__u8  pad[16];
+	__u32 devid;
+	__u8  pad[12];
 };
 
 struct kvm_arm_device_addr {

From b46f01ce4dcfdce636588fe2ef5035724c77f266 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:25 +0100
Subject: [PATCH 259/302] KVM: arm/arm64: Extend arch CAP checks to allow
 per-VM capabilities

KVM capabilities can be a per-VM property, though ARM/ARM64 currently
does not pass on the VM pointer to the architecture specific
capability handlers.
Add a "struct kvm*" parameter to those functions to later allow proper
per-VM capability reporting.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Eric Auger <eric.auger@linaro.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm/include/asm/kvm_host.h   | 2 +-
 arch/arm/kvm/arm.c                | 2 +-
 arch/arm64/include/asm/kvm_host.h | 2 +-
 arch/arm64/kvm/reset.c            | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 58d0b69e7428ce..de338d93d11b90 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -272,7 +272,7 @@ static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
 	kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
 }
 
-static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	return 0;
 }
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 7cf266c502d6de..972075cc111cb8 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -201,7 +201,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_MAX_VCPUS;
 		break;
 	default:
-		r = kvm_arch_dev_ioctl_check_extension(ext);
+		r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
 		break;
 	}
 	return r;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 69d5cc2d2e17f9..3eda975837d0ff 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,7 +47,7 @@
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(long ext);
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
 void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
 
 struct kvm_arch {
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 79f324823340de..e95d4f68bf544f 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void)
  * We currently assume that the number of HW registers is uniform
  * across all CPUs (see cpuinfo_sanity_check).
  */
-int kvm_arch_dev_ioctl_check_extension(long ext)
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	int r;
 

From 8a39d00670f0792c1186e442e1dd28fe0326f2ee Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:26 +0100
Subject: [PATCH 260/302] KVM: kvm_io_bus: Add kvm_io_bus_get_dev() call

The kvm_io_bus framework is a nice place to hold information about
various MMIO regions for kernel emulated devices.
Add a call to retrieve the kvm_io_device structure which is associated
with a certain MMIO address. This avoids duplicating kvm_io_bus'
knowledge of MMIO regions without having to fake MMIO calls if a user
needs the device a certain MMIO address belongs to.
This will be used by the ITS emulation to get the associated ITS device
when someone triggers an MSI via an ioctl from userspace.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0640ee92b97872..614a98137c5f73 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			      struct kvm_io_device *dev);
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+					 gpa_t addr);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef54b4c3179262..bd2eb92c5d0eb3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3496,6 +3496,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	return r;
 }
 
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+					 gpa_t addr)
+{
+	struct kvm_io_bus *bus;
+	int dev_idx, srcu_idx;
+	struct kvm_io_device *iodev = NULL;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+	if (dev_idx < 0)
+		goto out_unlock;
+
+	iodev = bus->range[dev_idx].dev;
+
+out_unlock:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return iodev;
+}
+EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
 static struct notifier_block kvm_cpu_notifier = {
 	.notifier_call = kvm_cpu_hotplug,
 };

From 5dd4b924e390af426e424d5e52c1b4d1566af817 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:27 +0100
Subject: [PATCH 261/302] KVM: arm/arm64: vgic: Add refcounting for IRQs

At the moment our struct vgic_irq's are statically allocated at guest
creation time. So getting a pointer to an IRQ structure is trivial and
safe. LPIs are more dynamic, they can be mapped and unmapped at any time
during the guest's _runtime_.
In preparation for supporting LPIs we introduce reference counting for
those structures using the kernel's kref infrastructure.
Since private IRQs and SPIs are statically allocated, we avoid actually
refcounting them, since they would never be released anyway.
But we take provisions to increase the refcount when an IRQ gets onto a
VCPU list and decrease it when it gets removed. Also this introduces
vgic_put_irq(), which wraps kref_put and hides the release function from
the callers.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h           |  1 +
 virt/kvm/arm/vgic/vgic-init.c    |  2 ++
 virt/kvm/arm/vgic/vgic-mmio-v2.c |  8 +++++
 virt/kvm/arm/vgic/vgic-mmio-v3.c | 20 +++++++-----
 virt/kvm/arm/vgic/vgic-mmio.c    | 25 ++++++++++++++-
 virt/kvm/arm/vgic/vgic-v2.c      |  1 +
 virt/kvm/arm/vgic/vgic-v3.c      |  1 +
 virt/kvm/arm/vgic/vgic.c         | 52 +++++++++++++++++++++++++++++---
 virt/kvm/arm/vgic/vgic.h         |  1 +
 9 files changed, 99 insertions(+), 12 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 5142e2ab9f5e1d..450b4dab9a9f51 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -96,6 +96,7 @@ struct vgic_irq {
 	bool active;			/* not used for LPIs */
 	bool enabled;
 	bool hw;			/* Tied to HW IRQ */
+	struct kref refcount;		/* Used for LPIs */
 	u32 hwintid;			/* HW INTID number */
 	union {
 		u8 targets;			/* GICv2 target VCPUs mask */
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 90cae489c34cdd..ac3c1a5f7bf485 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -177,6 +177,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
 		spin_lock_init(&irq->irq_lock);
 		irq->vcpu = NULL;
 		irq->target_vcpu = vcpu0;
+		kref_init(&irq->refcount);
 		if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
 			irq->targets = 0;
 		else
@@ -211,6 +212,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
 		irq->vcpu = NULL;
 		irq->target_vcpu = vcpu;
 		irq->targets = 1U << vcpu->vcpu_id;
+		kref_init(&irq->refcount);
 		if (vgic_irq_is_sgi(i)) {
 			/* SGIs */
 			irq->enabled = 1;
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index a21393637e4b9b..4152348f5e4f14 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
 		irq->source |= 1U << source_vcpu->vcpu_id;
 
 		vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+		vgic_put_irq(source_vcpu->kvm, irq);
 	}
 }
 
@@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		val |= (u64)irq->targets << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return val;
@@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
 		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		val |= (u64)irq->source << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 	return val;
 }
@@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
 			irq->pending = false;
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
 		} else {
 			spin_unlock(&irq->irq_lock);
 		}
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index fc7b6c97acbb25..bfcafbd8fa0298 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -80,15 +80,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
 {
 	int intid = VGIC_ADDR_TO_INTID(addr, 64);
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+	unsigned long ret = 0;
 
 	if (!irq)
 		return 0;
 
 	/* The upper word is RAZ for us. */
-	if (addr & 4)
-		return 0;
+	if (!(addr & 4))
+		ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
 
-	return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+	vgic_put_irq(vcpu->kvm, irq);
+	return ret;
 }
 
 static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
@@ -96,15 +98,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
 				    unsigned long val)
 {
 	int intid = VGIC_ADDR_TO_INTID(addr, 64);
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-	if (!irq)
-		return;
+	struct vgic_irq *irq;
 
 	/* The upper word is WI for us since we don't implement Aff3. */
 	if (addr & 4)
 		return;
 
+	irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+	if (!irq)
+		return;
+
 	spin_lock(&irq->irq_lock);
 
 	/* We only care about and preserve Aff0, Aff1 and Aff2. */
@@ -112,6 +116,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
 	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
 
 	spin_unlock(&irq->irq_lock);
+	vgic_put_irq(vcpu->kvm, irq);
 }
 
 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -445,5 +450,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
 		irq->pending = true;
 
 		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 9f6fab74dce7e1..5e79e0137cb6a6 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
 
 		if (irq->enabled)
 			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return value;
@@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
 		spin_lock(&irq->irq_lock);
 		irq->enabled = true;
 		vgic_queue_irq_unlock(vcpu->kvm, irq);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
 		irq->enabled = false;
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 
 		if (irq->pending)
 			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return value;
@@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			irq->soft_pending = true;
 
 		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 		}
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 
 		if (irq->active)
 			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return value;
@@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 		vgic_mmio_change_active(vcpu, irq, false);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 	vgic_change_active_finish(vcpu, intid);
 }
@@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 		vgic_mmio_change_active(vcpu, irq, true);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 	vgic_change_active_finish(vcpu, intid);
 }
@@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		val |= (u64)irq->priority << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return val;
@@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
 		/* Narrow the priority range to what we actually support */
 		irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
 		spin_unlock(&irq->irq_lock);
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
@@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
 
 		if (irq->config == VGIC_CONFIG_EDGE)
 			value |= (2U << (i * 2));
+
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 
 	return value;
@@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
 	int i;
 
 	for (i = 0; i < len * 4; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		struct vgic_irq *irq;
 
 		/*
 		 * The configuration cannot be changed for SGIs in general,
@@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
 		if (intid + i < VGIC_NR_PRIVATE_IRQS)
 			continue;
 
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 		spin_lock(&irq->irq_lock);
+
 		if (test_bit(i * 2 + 1, &val)) {
 			irq->config = VGIC_CONFIG_EDGE;
 		} else {
 			irq->config = VGIC_CONFIG_LEVEL;
 			irq->pending = irq->line_level | irq->soft_pending;
 		}
+
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 079bf670c4512e..0bf6709d1006c4 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 		}
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index e48a22e9ee4047..f0ac0642303c8d 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -113,6 +113,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 		}
 
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 69b61abefa19a8..fb19a554d090c0 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -64,6 +64,28 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 	return NULL;
 }
 
+static void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	kref_get(&irq->refcount);
+}
+
+/* The refcount should never drop to 0 at the moment. */
+static void vgic_irq_release(struct kref *ref)
+{
+	WARN_ON(1);
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	kref_put(&irq->refcount, vgic_irq_release);
+}
+
 /**
  * kvm_vgic_target_oracle - compute the target vcpu for an irq
  *
@@ -236,6 +258,11 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq)
 		goto retry;
 	}
 
+	/*
+	 * Grab a reference to the irq to reflect the fact that it is
+	 * now in the ap_list.
+	 */
+	vgic_get_irq_kref(irq);
 	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
 	irq->vcpu = vcpu;
 
@@ -269,14 +296,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	if (!irq)
 		return -EINVAL;
 
-	if (irq->hw != mapped_irq)
+	if (irq->hw != mapped_irq) {
+		vgic_put_irq(kvm, irq);
 		return -EINVAL;
+	}
 
 	spin_lock(&irq->irq_lock);
 
 	if (!vgic_validate_injection(irq, level)) {
 		/* Nothing to see here, move along... */
 		spin_unlock(&irq->irq_lock);
+		vgic_put_irq(kvm, irq);
 		return 0;
 	}
 
@@ -288,6 +318,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	}
 
 	vgic_queue_irq_unlock(kvm, irq);
+	vgic_put_irq(kvm, irq);
 
 	return 0;
 }
@@ -330,25 +361,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
 	irq->hwintid = phys_irq;
 
 	spin_unlock(&irq->irq_lock);
+	vgic_put_irq(vcpu->kvm, irq);
 
 	return 0;
 }
 
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
-
-	BUG_ON(!irq);
+	struct vgic_irq *irq;
 
 	if (!vgic_initialized(vcpu->kvm))
 		return -EAGAIN;
 
+	irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+	BUG_ON(!irq);
+
 	spin_lock(&irq->irq_lock);
 
 	irq->hw = false;
 	irq->hwintid = 0;
 
 	spin_unlock(&irq->irq_lock);
+	vgic_put_irq(vcpu->kvm, irq);
 
 	return 0;
 }
@@ -386,6 +420,15 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
 			list_del(&irq->ap_list);
 			irq->vcpu = NULL;
 			spin_unlock(&irq->irq_lock);
+
+			/*
+			 * This vgic_put_irq call matches the
+			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
+			 * where we added the LPI to the ap_list. As
+			 * we remove the irq from the list, we drop
+			 * also drop the refcount.
+			 */
+			vgic_put_irq(vcpu->kvm, irq);
 			continue;
 		}
 
@@ -614,6 +657,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 	spin_lock(&irq->irq_lock);
 	map_is_active = irq->hw && irq->active;
 	spin_unlock(&irq->irq_lock);
+	vgic_put_irq(vcpu->kvm, irq);
 
 	return map_is_active;
 }
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index c752152e82485e..5b79c340f17e2e 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -38,6 +38,7 @@ struct vgic_vmcr {
 
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
 void vgic_kick_vcpus(struct kvm *kvm);
 

From 645b9e49a8c053182aae0765d797f557f7a67eda Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:28 +0100
Subject: [PATCH 262/302] irqchip/gic-v3: Refactor and add GICv3 definitions

arm-gic-v3.h contains bit and register definitions for the GICv3 and ITS,
at least for the bits that we currently care about.
The ITS emulation needs more definitions, so add them and refactor
the memory attribute #defines to be more universally usable.
To avoid changing all users, we still provide some of the old definitions
defined with the help of the new macros.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 180 +++++++++++++++++++----------
 1 file changed, 120 insertions(+), 60 deletions(-)

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index bfbd707de390df..9442be7f2461b9 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -112,34 +112,60 @@
 #define GICR_WAKER_ProcessorSleep	(1U << 1)
 #define GICR_WAKER_ChildrenAsleep	(1U << 2)
 
-#define GICR_PROPBASER_NonShareable	(0U << 10)
-#define GICR_PROPBASER_InnerShareable	(1U << 10)
-#define GICR_PROPBASER_OuterShareable	(2U << 10)
-#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PROPBASER_nCnB		(0U << 7)
-#define GICR_PROPBASER_nC		(1U << 7)
-#define GICR_PROPBASER_RaWt		(2U << 7)
-#define GICR_PROPBASER_RaWb		(3U << 7)
-#define GICR_PROPBASER_WaWt		(4U << 7)
-#define GICR_PROPBASER_WaWb		(5U << 7)
-#define GICR_PROPBASER_RaWaWt		(6U << 7)
-#define GICR_PROPBASER_RaWaWb		(7U << 7)
-#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
-#define GICR_PROPBASER_IDBITS_MASK	(0x1f)
-
-#define GICR_PENDBASER_NonShareable	(0U << 10)
-#define GICR_PENDBASER_InnerShareable	(1U << 10)
-#define GICR_PENDBASER_OuterShareable	(2U << 10)
-#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PENDBASER_nCnB		(0U << 7)
-#define GICR_PENDBASER_nC		(1U << 7)
-#define GICR_PENDBASER_RaWt		(2U << 7)
-#define GICR_PENDBASER_RaWb		(3U << 7)
-#define GICR_PENDBASER_WaWt		(4U << 7)
-#define GICR_PENDBASER_WaWb		(5U << 7)
-#define GICR_PENDBASER_RaWaWt		(6U << 7)
-#define GICR_PENDBASER_RaWaWb		(7U << 7)
-#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+#define GIC_BASER_CACHE_nCnB		0ULL
+#define GIC_BASER_CACHE_SameAsInner	0ULL
+#define GIC_BASER_CACHE_nC		1ULL
+#define GIC_BASER_CACHE_RaWt		2ULL
+#define GIC_BASER_CACHE_RaWb		3ULL
+#define GIC_BASER_CACHE_WaWt		4ULL
+#define GIC_BASER_CACHE_WaWb		5ULL
+#define GIC_BASER_CACHE_RaWaWt		6ULL
+#define GIC_BASER_CACHE_RaWaWb		7ULL
+#define GIC_BASER_CACHE_MASK		7ULL
+#define GIC_BASER_NonShareable		0ULL
+#define GIC_BASER_InnerShareable	1ULL
+#define GIC_BASER_OuterShareable	2ULL
+#define GIC_BASER_SHAREABILITY_MASK	3ULL
+
+#define GIC_BASER_CACHEABILITY(reg, inner_outer, type)			\
+	(GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+
+#define GIC_BASER_SHAREABILITY(reg, type)				\
+	(GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+
+#define GICR_PROPBASER_SHAREABILITY_SHIFT		(10)
+#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT		(7)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT		(56)
+#define GICR_PROPBASER_SHAREABILITY_MASK				\
+	GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+#define GICR_PROPBASER_INNER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PROPBASER_InnerShareable					\
+	GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_IDBITS_MASK			(0x1f)
+
+#define GICR_PENDBASER_SHAREABILITY_SHIFT		(10)
+#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT		(7)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT		(56)
+#define GICR_PENDBASER_SHAREABILITY_MASK				\
+	GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+#define GICR_PENDBASER_INNER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PENDBASER_InnerShareable					\
+	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_PTZ				BIT_ULL(62)
 
 /*
  * Re-Distributor registers, offsets from SGI_base
@@ -175,59 +201,74 @@
 #define GITS_CWRITER			0x0088
 #define GITS_CREADR			0x0090
 #define GITS_BASER			0x0100
+#define GITS_IDREGS_BASE		0xffd0
+#define GITS_PIDR0			0xffe0
+#define GITS_PIDR1			0xffe4
 #define GITS_PIDR2			GICR_PIDR2
+#define GITS_PIDR4			0xffd0
+#define GITS_CIDR0			0xfff0
+#define GITS_CIDR1			0xfff4
+#define GITS_CIDR2			0xfff8
+#define GITS_CIDR3			0xfffc
 
 #define GITS_TRANSLATER			0x10040
 
 #define GITS_CTLR_ENABLE		(1U << 0)
 #define GITS_CTLR_QUIESCENT		(1U << 31)
 
+#define GITS_TYPER_PLPIS		(1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT		8
 #define GITS_TYPER_DEVBITS_SHIFT	13
 #define GITS_TYPER_DEVBITS(r)		((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
 #define GITS_TYPER_PTA			(1UL << 19)
-
-#define GITS_CBASER_VALID		(1UL << 63)
-#define GITS_CBASER_nCnB		(0UL << 59)
-#define GITS_CBASER_nC			(1UL << 59)
-#define GITS_CBASER_RaWt		(2UL << 59)
-#define GITS_CBASER_RaWb		(3UL << 59)
-#define GITS_CBASER_WaWt		(4UL << 59)
-#define GITS_CBASER_WaWb		(5UL << 59)
-#define GITS_CBASER_RaWaWt		(6UL << 59)
-#define GITS_CBASER_RaWaWb		(7UL << 59)
-#define GITS_CBASER_CACHEABILITY_MASK	(7UL << 59)
-#define GITS_CBASER_NonShareable	(0UL << 10)
-#define GITS_CBASER_InnerShareable	(1UL << 10)
-#define GITS_CBASER_OuterShareable	(2UL << 10)
-#define GITS_CBASER_SHAREABILITY_MASK	(3UL << 10)
+#define GITS_TYPER_HWCOLLCNT_SHIFT	24
+
+#define GITS_CBASER_VALID			(1UL << 63)
+#define GITS_CBASER_SHAREABILITY_SHIFT		(10)
+#define GITS_CBASER_INNER_CACHEABILITY_SHIFT	(59)
+#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT	(53)
+#define GITS_CBASER_SHAREABILITY_MASK					\
+	GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+#define GITS_CBASER_INNER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+#define GITS_CBASER_OUTER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+
+#define GITS_CBASER_InnerShareable					\
+	GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
 
 #define GITS_BASER_NR_REGS		8
 
-#define GITS_BASER_VALID		(1UL << 63)
-#define GITS_BASER_nCnB			(0UL << 59)
-#define GITS_BASER_nC			(1UL << 59)
-#define GITS_BASER_RaWt			(2UL << 59)
-#define GITS_BASER_RaWb			(3UL << 59)
-#define GITS_BASER_WaWt			(4UL << 59)
-#define GITS_BASER_WaWb			(5UL << 59)
-#define GITS_BASER_RaWaWt		(6UL << 59)
-#define GITS_BASER_RaWaWb		(7UL << 59)
-#define GITS_BASER_CACHEABILITY_MASK	(7UL << 59)
-#define GITS_BASER_TYPE_SHIFT		(56)
+#define GITS_BASER_VALID			(1UL << 63)
+#define GITS_BASER_INDIRECT			(1ULL << 62)
+#define GITS_BASER_INNER_CACHEABILITY_SHIFT	(59)
+#define GITS_BASER_OUTER_CACHEABILITY_SHIFT	(53)
+#define GITS_BASER_INNER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_OUTER_CACHEABILITY_MASK				\
+	GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+#define GITS_BASER_SHAREABILITY_MASK					\
+	GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+
+#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_TYPE_SHIFT			(56)
 #define GITS_BASER_TYPE(r)		(((r) >> GITS_BASER_TYPE_SHIFT) & 7)
-#define GITS_BASER_ENTRY_SIZE_SHIFT	(48)
+#define GITS_BASER_ENTRY_SIZE_SHIFT		(48)
 #define GITS_BASER_ENTRY_SIZE(r)	((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
-#define GITS_BASER_NonShareable		(0UL << 10)
-#define GITS_BASER_InnerShareable	(1UL << 10)
-#define GITS_BASER_OuterShareable	(2UL << 10)
 #define GITS_BASER_SHAREABILITY_SHIFT	(10)
-#define GITS_BASER_SHAREABILITY_MASK	(3UL << GITS_BASER_SHAREABILITY_SHIFT)
+#define GITS_BASER_InnerShareable					\
+	GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
 #define GITS_BASER_PAGE_SIZE_SHIFT	(8)
 #define GITS_BASER_PAGE_SIZE_4K		(0UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_16K	(1UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_64K	(2UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGE_SIZE_MASK	(3UL << GITS_BASER_PAGE_SIZE_SHIFT)
 #define GITS_BASER_PAGES_MAX		256
+#define GITS_BASER_NR_PAGES(r)		(((r) & 0xff) + 1)
 
 #define GITS_BASER_TYPE_NONE		0
 #define GITS_BASER_TYPE_DEVICE		1
@@ -243,7 +284,10 @@
  */
 #define GITS_CMD_MAPD			0x08
 #define GITS_CMD_MAPC			0x09
-#define GITS_CMD_MAPVI			0x0a
+#define GITS_CMD_MAPTI			0x0a
+/* older GIC documentation used MAPVI for this command */
+#define GITS_CMD_MAPVI			GITS_CMD_MAPTI
+#define GITS_CMD_MAPI			0x0b
 #define GITS_CMD_MOVI			0x01
 #define GITS_CMD_DISCARD		0x0f
 #define GITS_CMD_INV			0x0c
@@ -253,6 +297,22 @@
 #define GITS_CMD_CLEAR			0x04
 #define GITS_CMD_SYNC			0x05
 
+/*
+ * ITS error numbers
+ */
+#define E_ITS_MOVI_UNMAPPED_INTERRUPT		0x010107
+#define E_ITS_MOVI_UNMAPPED_COLLECTION		0x010109
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT		0x010507
+#define E_ITS_MAPD_DEVICE_OOR			0x010801
+#define E_ITS_MAPC_PROCNUM_OOR			0x010902
+#define E_ITS_MAPC_COLLECTION_OOR		0x010903
+#define E_ITS_MAPTI_UNMAPPED_DEVICE		0x010a04
+#define E_ITS_MAPTI_PHYSICALID_OOR		0x010a06
+#define E_ITS_INV_UNMAPPED_INTERRUPT		0x010c07
+#define E_ITS_INVALL_UNMAPPED_COLLECTION	0x010d09
+#define E_ITS_MOVALL_PROCNUM_OOR		0x010e01
+#define E_ITS_DISCARD_UNMAPPED_INTERRUPT	0x010f07
+
 /*
  * CPU interface registers
  */

From 0aa1de57319c4e023187aca0d59dd593a96459a8 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:29 +0100
Subject: [PATCH 263/302] KVM: arm64: vgic: Handle ITS related GICv3
 redistributor registers

In the GICv3 redistributor there are the PENDBASER and PROPBASER
registers which we did not emulate so far, as they only make sense
when having an ITS. In preparation for that emulate those MMIO
accesses by storing the 64-bit data written into it into a variable
which we later read in the ITS emulation.
We also sanitise the registers, making sure RES0 regions are respected
and checking for valid memory attributes.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h           |  13 +++
 virt/kvm/arm/vgic/vgic-mmio-v3.c | 153 ++++++++++++++++++++++++++++++-
 virt/kvm/arm/vgic/vgic-mmio.h    |   8 ++
 virt/kvm/arm/vgic/vgic-v3.c      |  11 ++-
 4 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 450b4dab9a9f51..df2dec5ef62012 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -146,6 +146,14 @@ struct vgic_dist {
 	struct vgic_irq		*spis;
 
 	struct vgic_io_device	dist_iodev;
+
+	/*
+	 * Contains the attributes and gpa of the LPI configuration table.
+	 * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
+	 * one address across all redistributors.
+	 * GICv3 spec: 6.1.2 "LPI Configuration tables"
+	 */
+	u64			propbaser;
 };
 
 struct vgic_v2_cpu_if {
@@ -200,6 +208,11 @@ struct vgic_cpu {
 	 */
 	struct vgic_io_device	rd_iodev;
 	struct vgic_io_device	sgi_iodev;
+
+	/* Contains the attributes and gpa of the LPI pending tables. */
+	u64 pendbaser;
+
+	bool lpis_enabled;
 };
 
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index bfcafbd8fa0298..278bfbb36ef986 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -29,6 +29,19 @@ static unsigned long extract_bytes(unsigned long data, unsigned int offset,
 	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
 }
 
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+			    unsigned long val)
+{
+	int lower = (offset & 4) * 8;
+	int upper = lower + 8 * len - 1;
+
+	reg &= ~GENMASK_ULL(upper, lower);
+	val &= GENMASK_ULL(len * 8 - 1, 0);
+
+	return reg | ((u64)val << lower);
+}
+
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
 					    gpa_t addr, unsigned int len)
 {
@@ -152,6 +165,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_OuterShareable:
+		return GIC_BASER_InnerShareable;
+	default:
+		return field;
+	}
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_nCnB:
+	case GIC_BASER_CACHE_nC:
+		return GIC_BASER_CACHE_RaWb;
+	default:
+		return field;
+	}
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_SameAsInner:
+	case GIC_BASER_CACHE_nC:
+		return field;
+	default:
+		return GIC_BASER_CACHE_nC;
+	}
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+			u64 (*sanitise_fn)(u64))
+{
+	u64 field = (reg & field_mask) >> field_shift;
+
+	field = sanitise_fn(field) << field_shift;
+	return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK						\
+	(GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK						\
+	(BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |	\
+	 GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+				  GICR_PENDBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PENDBASER_RES0_MASK;
+	reg &= ~GENMASK_ULL(51, 48);
+
+	return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+				  GICR_PROPBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PROPBASER_RES0_MASK;
+	reg &= ~GENMASK_ULL(51, 48);
+	return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 propbaser = dist->propbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+	propbaser = vgic_sanitise_propbaser(propbaser);
+
+	dist->propbaser = propbaser;
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 pendbaser = vgic_cpu->pendbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+	pendbaser = vgic_sanitise_pendbaser(pendbaser);
+
+	vgic_cpu->pendbaser = pendbaser;
+}
+
 /*
  * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
  * redistributors, while SPIs are covered by registers in the distributor
@@ -232,10 +381,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
 		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+		vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+		vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
 		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 850901482aecd9..71aa39d4cfdfa6 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -147,4 +147,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
 
 unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
 
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+			u64 (*sanitise_fn)(u64));
+#endif
+
 #endif
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index f0ac0642303c8d..6f8f31f910e731 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -191,6 +191,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
 	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
 }
 
+#define INITIAL_PENDBASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)		| \
+	GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)	| \
+	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
 void vgic_v3_enable(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -208,10 +213,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
 	 * way, so we force SRE to 1 to demonstrate this to the guest.
 	 * This goes with the spec allowing the value to be RAO/WI.
 	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
 		vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
-	else
+		vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+	} else {
 		vgic_v3->vgic_sre = 0;
+	}
 
 	/* Get the show on the road... */
 	vgic_v3->vgic_hcr = ICH_HCR_EN;

From 59c5ab40989afa5aba9c4a0918a5ed910a917422 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:30 +0100
Subject: [PATCH 264/302] KVM: arm64: vgic-its: Introduce ITS emulation file
 with MMIO framework

The ARM GICv3 ITS emulation code goes into a separate file, but needs
to be connected to the GICv3 emulation, of which it is an option.
The ITS MMIO handlers require the respective ITS pointer to be passed in,
so we amend the existing VGIC MMIO framework to let it cope with that.
Also we introduce the basic ITS data structure and initialize it, but
don't return any success yet, as we are not yet ready for the show.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h           |  22 ++++++-
 virt/kvm/arm/vgic/vgic-its.c     | 103 +++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic-mmio-v3.c |  40 +++++++++++-
 virt/kvm/arm/vgic/vgic-mmio.c    |  37 ++++++++---
 virt/kvm/arm/vgic/vgic-mmio.h    |  17 +++--
 virt/kvm/arm/vgic/vgic.h         |   7 +++
 6 files changed, 213 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/arm/vgic/vgic-its.c

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index df2dec5ef62012..685f33975ce4b3 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -108,15 +108,35 @@ struct vgic_irq {
 };
 
 struct vgic_register_region;
+struct vgic_its;
+
+enum iodev_type {
+	IODEV_CPUIF,
+	IODEV_DIST,
+	IODEV_REDIST,
+	IODEV_ITS
+};
 
 struct vgic_io_device {
 	gpa_t base_addr;
-	struct kvm_vcpu *redist_vcpu;
+	union {
+		struct kvm_vcpu *redist_vcpu;
+		struct vgic_its *its;
+	};
 	const struct vgic_register_region *regions;
+	enum iodev_type iodev_type;
 	int nr_regions;
 	struct kvm_io_device dev;
 };
 
+struct vgic_its {
+	/* The base address of the ITS control register frame */
+	gpa_t			vgic_its_base;
+
+	bool			enabled;
+	struct vgic_io_device	iodev;
+};
+
 struct vgic_dist {
 	bool			in_kernel;
 	bool			ready;
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
new file mode 100644
index 00000000000000..4654d6edf6a6de
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -0,0 +1,103 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc)		\
+{								\
+	.reg_offset = off,					\
+	.len = length,						\
+	.access_flags = acc,					\
+	.its_read = rd,						\
+	.its_write = wr,					\
+}
+
+static unsigned long its_mmio_read_raz(struct kvm *kvm, struct vgic_its *its,
+				       gpa_t addr, unsigned int len)
+{
+	return 0;
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+			      gpa_t addr, unsigned int len, unsigned long val)
+{
+	/* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+	REGISTER_ITS_DESC(GITS_CTLR,
+		its_mmio_read_raz, its_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_IIDR,
+		its_mmio_read_raz, its_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_TYPER,
+		its_mmio_read_raz, its_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_CBASER,
+		its_mmio_read_raz, its_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_CWRITER,
+		its_mmio_read_raz, its_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_CREADR,
+		its_mmio_read_raz, its_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_BASER,
+		its_mmio_read_raz, its_mmio_write_wi, 0x40,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+		its_mmio_read_raz, its_mmio_write_wi, 0x30,
+		VGIC_ACCESS_32bit),
+};
+
+static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
+{
+	struct vgic_io_device *iodev = &its->iodev;
+	int ret;
+
+	if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
+		return -ENXIO;
+
+	iodev->regions = its_registers;
+	iodev->nr_regions = ARRAY_SIZE(its_registers);
+	kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+	iodev->base_addr = its->vgic_its_base;
+	iodev->iodev_type = IODEV_ITS;
+	iodev->its = its;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+				      KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 278bfbb36ef986..b92b7d6cabe6b0 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -42,6 +42,16 @@ static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
 	return reg | ((u64)val << lower);
 }
 
+bool vgic_has_its(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+		return false;
+
+	return false;
+}
+
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
 					    gpa_t addr, unsigned int len)
 {
@@ -132,6 +142,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
 	vgic_put_irq(vcpu->kvm, irq);
 }
 
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	bool was_enabled = vgic_cpu->lpis_enabled;
+
+	if (!vgic_has_its(vcpu->kvm))
+		return;
+
+	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+	if (!was_enabled && vgic_cpu->lpis_enabled) {
+		/* Eventually do something */
+	}
+}
+
 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
 					      gpa_t addr, unsigned int len)
 {
@@ -372,7 +408,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
 
 static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
 	REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
 		VGIC_ACCESS_32bit),
 	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
 		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
@@ -450,6 +486,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 
 		kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
 		rd_dev->base_addr = rd_base;
+		rd_dev->iodev_type = IODEV_REDIST;
 		rd_dev->regions = vgic_v3_rdbase_registers;
 		rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
 		rd_dev->redist_vcpu = vcpu;
@@ -464,6 +501,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
 
 		kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
 		sgi_dev->base_addr = sgi_base;
+		sgi_dev->iodev_type = IODEV_REDIST;
 		sgi_dev->regions = vgic_v3_sgibase_registers;
 		sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
 		sgi_dev->redist_vcpu = vcpu;
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 5e79e0137cb6a6..26be827bbfcc54 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -473,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
 	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
 	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
-	unsigned long data;
+	unsigned long data = 0;
 
 	region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
 				       addr - iodev->base_addr);
@@ -483,8 +482,20 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 		return 0;
 	}
 
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	data = region->read(r_vcpu, addr, len);
+	switch (iodev->iodev_type) {
+	case IODEV_CPUIF:
+		return 1;
+	case IODEV_DIST:
+		data = region->read(vcpu, addr, len);
+		break;
+	case IODEV_REDIST:
+		data = region->read(iodev->redist_vcpu, addr, len);
+		break;
+	case IODEV_ITS:
+		data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+		break;
+	}
+
 	vgic_data_host_to_mmio_bus(val, len, data);
 	return 0;
 }
@@ -494,7 +505,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 {
 	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
 	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
 	unsigned long data = vgic_data_mmio_bus_to_host(val, len);
 
 	region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
@@ -505,8 +515,20 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 	if (!check_region(region, addr, len))
 		return 0;
 
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	region->write(r_vcpu, addr, len, data);
+	switch (iodev->iodev_type) {
+	case IODEV_CPUIF:
+		break;
+	case IODEV_DIST:
+		region->write(vcpu, addr, len, data);
+		break;
+	case IODEV_REDIST:
+		region->write(iodev->redist_vcpu, addr, len, data);
+		break;
+	case IODEV_ITS:
+		region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+		break;
+	}
+
 	return 0;
 }
 
@@ -536,6 +558,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
 	}
 
 	io_device->base_addr = dist_base_address;
+	io_device->iodev_type = IODEV_DIST;
 	io_device->redist_vcpu = NULL;
 
 	mutex_lock(&kvm->slots_lock);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 71aa39d4cfdfa6..366d66378732b0 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -21,10 +21,19 @@ struct vgic_register_region {
 	unsigned int len;
 	unsigned int bits_per_irq;
 	unsigned int access_flags;
-	unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-			      unsigned int len);
-	void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
-		      unsigned long val);
+	union {
+		unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+				      unsigned int len);
+		unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+					  gpa_t addr, unsigned int len);
+	};
+	union {
+		void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+			      unsigned int len, unsigned long val);
+		void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+				  gpa_t addr, unsigned int len,
+				  unsigned long val);
+	};
 };
 
 extern struct kvm_io_device_ops kvm_io_gic_ops;
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 5b79c340f17e2e..31807c166d2a2f 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -72,6 +72,7 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
 int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+bool vgic_has_its(struct kvm *kvm);
 #else
 static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
 {
@@ -123,6 +124,12 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
 {
 	return -ENODEV;
 }
+
+static inline bool vgic_has_its(struct kvm *kvm)
+{
+	return false;
+}
+
 #endif
 
 int kvm_register_vgic_device(unsigned long type);

From 1085fdc68c6097244627a02a56bd2d8fe58a1a9c Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:31 +0100
Subject: [PATCH 265/302] KVM: arm64: vgic-its: Introduce new KVM ITS device

Introduce a new KVM device that represents an ARM Interrupt Translation
Service (ITS) controller. Since there can be several of these per guest,
we can't piggyback on the existing GICv3 distributor device; instead we
create a new type of KVM device.
On the KVM_CREATE_DEVICE ioctl we allocate and initialize the ITS data
structure and store the pointer in the kvm_device data.
Upon an explicit init ioctl from userland (after having setup the MMIO
address) we register the handlers with the kvm_io_bus framework.
Any reference to an ITS thus has to go via this interface.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 .../virtual/kvm/devices/arm-vgic.txt          |  25 +++-
 arch/arm/kvm/arm.c                            |   1 +
 arch/arm64/include/uapi/asm/kvm.h             |   2 +
 include/kvm/arm_vgic.h                        |   3 +
 include/uapi/linux/kvm.h                      |   2 +
 virt/kvm/arm/vgic/vgic-its.c                  | 135 ++++++++++++++++++
 virt/kvm/arm/vgic/vgic-kvm-device.c           |   4 +-
 virt/kvm/arm/vgic/vgic-mmio-v3.c              |   2 +-
 virt/kvm/arm/vgic/vgic.h                      |   3 +
 9 files changed, 168 insertions(+), 9 deletions(-)

diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 59541d49e15c01..89182f80cc7f21 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC)
 Device types supported:
   KVM_DEV_TYPE_ARM_VGIC_V2     ARM Generic Interrupt Controller v2.0
   KVM_DEV_TYPE_ARM_VGIC_V3     ARM Generic Interrupt Controller v3.0
+  KVM_DEV_TYPE_ARM_VGIC_ITS    ARM Interrupt Translation Service Controller
 
-Only one VGIC instance may be instantiated through either this API or the
-legacy KVM_CREATE_IRQCHIP api.  The created VGIC will act as the VM interrupt
-controller, requiring emulated user-space devices to inject interrupts to the
-VGIC instead of directly to CPUs.
+Only one VGIC instance of the V2/V3 types above may be instantiated through
+either this API or the legacy KVM_CREATE_IRQCHIP api.  The created VGIC will
+act as the VM interrupt controller, requiring emulated user-space devices to
+inject interrupts to the VGIC instead of directly to CPUs.
 
 Creating a guest GICv3 device requires a host GICv3 as well.
 GICv3 implementations with hardware compatibility support allow a guest GICv2
 as well.
 
+Creating a virtual ITS controller requires a host GICv3 (but does not depend
+on having physical ITS controllers).
+There can be multiple ITS controllers per guest, each of them has to have
+a separate, non-overlapping MMIO region.
+
 Groups:
   KVM_DEV_ARM_VGIC_GRP_ADDR
   Attributes:
@@ -39,6 +45,13 @@ Groups:
       Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
       This address needs to be 64K aligned.
 
+    KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit)
+      Base address in the guest physical address space of the GICv3 ITS
+      control register frame. The ITS allows MSI(-X) interrupts to be
+      injected into guests. This extension is optional. If the kernel
+      does not support the ITS, the call returns -ENODEV.
+      Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
+      This address needs to be 64K aligned and the region covers 128K.
 
   KVM_DEV_ARM_VGIC_GRP_DIST_REGS
   Attributes:
@@ -109,8 +122,8 @@ Groups:
   KVM_DEV_ARM_VGIC_GRP_CTRL
   Attributes:
     KVM_DEV_ARM_VGIC_CTRL_INIT
-      request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      request the initialization of the VGIC or ITS, no additional parameter
+      in kvm_device_attr.addr.
   Errors:
     -ENXIO: VGIC not properly configured as required prior to calling
      this attribute
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 972075cc111cb8..fb4661cf896ecd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -20,6 +20,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/list.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f209ea151dca8a..3051f86a9b5f4a 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -87,9 +87,11 @@ struct kvm_regs {
 /* Supported VGICv3 address types  */
 #define KVM_VGIC_V3_ADDR_TYPE_DIST	2
 #define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+#define KVM_VGIC_ITS_ADDR_TYPE		4
 
 #define KVM_VGIC_V3_DIST_SIZE		SZ_64K
 #define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE		(2 * SZ_64K)
 
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT		1 /* CPU running a 32bit VM */
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 685f33975ce4b3..8609faced83e51 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -134,6 +134,7 @@ struct vgic_its {
 	gpa_t			vgic_its_base;
 
 	bool			enabled;
+	bool			initialized;
 	struct vgic_io_device	iodev;
 };
 
@@ -167,6 +168,8 @@ struct vgic_dist {
 
 	struct vgic_io_device	dist_iodev;
 
+	bool			has_its;
+
 	/*
 	 * Contains the attributes and gpa of the LPI configuration table.
 	 * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7de96f5bb92c35..d8c4c324cfae53 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1077,6 +1077,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
 	KVM_DEV_TYPE_ARM_VGIC_V3,
 #define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
+	KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
 	KVM_DEV_TYPE_MAX,
 };
 
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 4654d6edf6a6de..6b47b367469027 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -21,6 +21,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
+#include <linux/uaccess.h>
 
 #include <linux/irqchip/arm-gic-v3.h>
 
@@ -84,6 +85,9 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
 	struct vgic_io_device *iodev = &its->iodev;
 	int ret;
 
+	if (its->initialized)
+		return 0;
+
 	if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
 		return -ENXIO;
 
@@ -99,5 +103,136 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
 				      KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
 	mutex_unlock(&kvm->slots_lock);
 
+	if (!ret)
+		its->initialized = true;
+
 	return ret;
 }
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+	struct vgic_its *its;
+
+	if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+		return -ENODEV;
+
+	its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+	if (!its)
+		return -ENOMEM;
+
+	its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+	dev->kvm->arch.vgic.has_its = true;
+	its->initialized = false;
+	its->enabled = false;
+
+	dev->private = its;
+
+	return 0;
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+	struct vgic_its *its = kvm_dev->private;
+
+	kfree(its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_ITS_ADDR_TYPE:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		}
+		break;
+	}
+	return -ENXIO;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	struct vgic_its *its = dev->private;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		unsigned long type = (unsigned long)attr->attr;
+		u64 addr;
+
+		if (type != KVM_VGIC_ITS_ADDR_TYPE)
+			return -ENODEV;
+
+		if (its->initialized)
+			return -EBUSY;
+
+		if (copy_from_user(&addr, uaddr, sizeof(addr)))
+			return -EFAULT;
+
+		ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+					addr, SZ_64K);
+		if (ret)
+			return ret;
+
+		its->vgic_its_base = addr;
+
+		return 0;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return vgic_its_init_its(dev->kvm, its);
+		}
+		break;
+	}
+	return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		struct vgic_its *its = dev->private;
+		u64 addr = its->vgic_its_base;
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		unsigned long type = (unsigned long)attr->attr;
+
+		if (type != KVM_VGIC_ITS_ADDR_TYPE)
+			return -ENODEV;
+
+		if (copy_to_user(uaddr, &addr, sizeof(addr)))
+			return -EFAULT;
+		break;
+	default:
+		return -ENXIO;
+	}
+	}
+
+	return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+	.name = "kvm-arm-vgic-its",
+	.create = vgic_its_create,
+	.destroy = vgic_its_destroy,
+	.set_attr = vgic_its_set_attr,
+	.get_attr = vgic_its_get_attr,
+	.has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+	return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+				       KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 2f24f13c6c9040..561d2ba96a4f2e 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -21,8 +21,8 @@
 
 /* common helpers */
 
-static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-			     phys_addr_t addr, phys_addr_t alignment)
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+		      phys_addr_t addr, phys_addr_t alignment)
 {
 	if (addr & ~KVM_PHYS_MASK)
 		return -E2BIG;
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index b92b7d6cabe6b0..a5c35050c7864d 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -49,7 +49,7 @@ bool vgic_has_its(struct kvm *kvm)
 	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
 		return false;
 
-	return false;
+	return dist->has_its;
 }
 
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 31807c166d2a2f..8192a293f119cb 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -42,6 +42,9 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
 void vgic_kick_vcpus(struct kvm *kvm);
 
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+		      phys_addr_t addr, phys_addr_t alignment);
+
 void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
 void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);

From 424c33830f53f248a68da125e70d9a4d95a8e010 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:32 +0100
Subject: [PATCH 266/302] KVM: arm64: vgic-its: Implement basic ITS register
 handlers

Add emulation for some basic MMIO registers used in the ITS emulation.
This includes:
- GITS_{CTLR,TYPER,IIDR}
- ID registers
- GITS_{CBASER,CREADR,CWRITER}
  (which implement the ITS command buffer handling)
- GITS_BASER<n>

Most of the handlers are pretty straightforward; only the CWRITER
handler is a bit more involved by taking the new its_cmd mutex and
then iterating over the command buffer.
The registers holding base addresses and attributes are sanitised before
storing them.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h           |  16 ++
 virt/kvm/arm/vgic/vgic-its.c     | 399 +++++++++++++++++++++++++++++--
 virt/kvm/arm/vgic/vgic-mmio-v3.c |   8 +-
 virt/kvm/arm/vgic/vgic-mmio.h    |   6 +
 virt/kvm/arm/vgic/vgic.c         |  12 +-
 5 files changed, 420 insertions(+), 21 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 8609faced83e51..61867492d361d0 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -22,6 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <kvm/iodev.h>
+#include <linux/list.h>
 
 #define VGIC_V3_MAX_CPUS	255
 #define VGIC_V2_MAX_CPUS	8
@@ -136,6 +137,21 @@ struct vgic_its {
 	bool			enabled;
 	bool			initialized;
 	struct vgic_io_device	iodev;
+
+	/* These registers correspond to GITS_BASER{0,1} */
+	u64			baser_device_table;
+	u64			baser_coll_table;
+
+	/* Protects the command queue */
+	struct mutex		cmd_lock;
+	u64			cbaser;
+	u32			creadr;
+	u32			cwriter;
+
+	/* Protects the device and collection lists */
+	struct mutex		its_lock;
+	struct list_head	device_list;
+	struct list_head	collection_list;
 };
 
 struct vgic_dist {
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 6b47b367469027..11cfe2f12c6cda 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -21,6 +21,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
+#include <linux/list.h>
 #include <linux/uaccess.h>
 
 #include <linux/irqchip/arm-gic-v3.h>
@@ -32,6 +33,329 @@
 #include "vgic.h"
 #include "vgic-mmio.h"
 
+struct its_device {
+	struct list_head dev_list;
+
+	/* the head for the list of ITTEs */
+	struct list_head itt_head;
+	u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+	struct list_head coll_list;
+
+	u32 collection_id;
+	u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+				((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_itte {
+	struct list_head itte_list;
+
+	struct its_collection *collection;
+	u32 lpi;
+	u32 event_id;
+};
+
+/*
+ * We only implement 48 bits of PA at the moment, although the ITS
+ * supports more. Let's be restrictive here.
+ */
+#define CBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
+					     struct vgic_its *its,
+					     gpa_t addr, unsigned int len)
+{
+	u32 reg = 0;
+
+	mutex_lock(&its->cmd_lock);
+	if (its->creadr == its->cwriter)
+		reg |= GITS_CTLR_QUIESCENT;
+	if (its->enabled)
+		reg |= GITS_CTLR_ENABLE;
+	mutex_unlock(&its->cmd_lock);
+
+	return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	its->enabled = !!(val & GITS_CTLR_ENABLE);
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+					      struct vgic_its *its,
+					      gpa_t addr, unsigned int len)
+{
+	u64 reg = GITS_TYPER_PLPIS;
+
+	/*
+	 * We use linear CPU numbers for redistributor addressing,
+	 * so GITS_TYPER.PTA is 0.
+	 * Also we force all PROPBASER registers to be the same, so
+	 * CommonLPIAff is 0 as well.
+	 * To avoid memory waste in the guest, we keep the number of IDBits and
+	 * DevBits low - at least for the time being.
+	 */
+	reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+	reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+
+	return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+					     struct vgic_its *its,
+					     gpa_t addr, unsigned int len)
+{
+	return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	switch (addr & 0xffff) {
+	case GITS_PIDR0:
+		return 0x92;	/* part number, bits[7:0] */
+	case GITS_PIDR1:
+		return 0xb4;	/* part number, bits[11:8] */
+	case GITS_PIDR2:
+		return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+	case GITS_PIDR4:
+		return 0x40;	/* This is a 64K software visible page */
+	/* The following are the ID registers for (any) GIC. */
+	case GITS_CIDR0:
+		return 0x0d;
+	case GITS_CIDR1:
+		return 0xf0;
+	case GITS_CIDR2:
+		return 0x05;
+	case GITS_CIDR3:
+		return 0xb1;
+	}
+
+	return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
+{
+	list_del(&itte->itte_list);
+	kfree(itte);
+}
+
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	return -ENODEV;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+				  GITS_BASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+				  GITS_BASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+				  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	/* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */
+	reg &= ~GENMASK_ULL(15, 12);
+
+	/* We support only one (ITS) page size: 64K */
+	reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+	return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+				  GITS_CBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+				  GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+				  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	/*
+	 * Sanitise the physical address to be 64k aligned.
+	 * Also limit the physical addresses to 48 bits.
+	 */
+	reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12));
+
+	return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+				       gpa_t addr, unsigned int len,
+				       unsigned long val)
+{
+	/* When GITS_CTLR.Enable is 1, this register is RO. */
+	if (its->enabled)
+		return;
+
+	mutex_lock(&its->cmd_lock);
+	its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+	its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+	its->creadr = 0;
+	/*
+	 * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+	 * it to CREADR to make sure we start with an empty command buffer.
+	 */
+	its->cwriter = its->creadr;
+	mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser)	((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE			32
+#define ITS_CMD_OFFSET(reg)		((reg) & GENMASK(19, 5))
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take its->cmd_lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+					gpa_t addr, unsigned int len,
+					unsigned long val)
+{
+	gpa_t cbaser;
+	u64 cmd_buf[4];
+	u32 reg;
+
+	if (!its)
+		return;
+
+	mutex_lock(&its->cmd_lock);
+
+	reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+	reg = ITS_CMD_OFFSET(reg);
+	if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+		mutex_unlock(&its->cmd_lock);
+		return;
+	}
+
+	its->cwriter = reg;
+	cbaser = CBASER_ADDRESS(its->cbaser);
+
+	while (its->cwriter != its->creadr) {
+		int ret = kvm_read_guest(kvm, cbaser + its->creadr,
+					 cmd_buf, ITS_CMD_SIZE);
+		/*
+		 * If kvm_read_guest() fails, this could be due to the guest
+		 * programming a bogus value in CBASER or something else going
+		 * wrong from which we cannot easily recover.
+		 * According to section 6.3.2 in the GICv3 spec we can just
+		 * ignore that command then.
+		 */
+		if (!ret)
+			vgic_its_handle_command(kvm, its, cmd_buf);
+
+		its->creadr += ITS_CMD_SIZE;
+		if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+			its->creadr = 0;
+	}
+
+	mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+						struct vgic_its *its,
+						gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+					      struct vgic_its *its,
+					      gpa_t addr, unsigned int len)
+{
+	u64 reg;
+
+	switch (BASER_INDEX(addr)) {
+	case 0:
+		reg = its->baser_device_table;
+		break;
+	case 1:
+		reg = its->baser_coll_table;
+		break;
+	default:
+		reg = 0;
+		break;
+	}
+
+	return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK	(GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+				      struct vgic_its *its,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
+{
+	u64 entry_size, device_type;
+	u64 reg, *regptr, clearbits = 0;
+
+	/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+	if (its->enabled)
+		return;
+
+	switch (BASER_INDEX(addr)) {
+	case 0:
+		regptr = &its->baser_device_table;
+		entry_size = 8;
+		device_type = GITS_BASER_TYPE_DEVICE;
+		break;
+	case 1:
+		regptr = &its->baser_coll_table;
+		entry_size = 8;
+		device_type = GITS_BASER_TYPE_COLLECTION;
+		clearbits = GITS_BASER_INDIRECT;
+		break;
+	default:
+		return;
+	}
+
+	reg = update_64bit_reg(*regptr, addr & 7, len, val);
+	reg &= ~GITS_BASER_RO_MASK;
+	reg &= ~clearbits;
+
+	reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+	reg |= device_type << GITS_BASER_TYPE_SHIFT;
+	reg = vgic_sanitise_its_baser(reg);
+
+	*regptr = reg;
+}
+
 #define REGISTER_ITS_DESC(off, rd, wr, length, acc)		\
 {								\
 	.reg_offset = off,					\
@@ -41,12 +365,6 @@
 	.its_write = wr,					\
 }
 
-static unsigned long its_mmio_read_raz(struct kvm *kvm, struct vgic_its *its,
-				       gpa_t addr, unsigned int len)
-{
-	return 0;
-}
-
 static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
 			      gpa_t addr, unsigned int len, unsigned long val)
 {
@@ -55,28 +373,28 @@ static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
 
 static struct vgic_register_region its_registers[] = {
 	REGISTER_ITS_DESC(GITS_CTLR,
-		its_mmio_read_raz, its_mmio_write_wi, 4,
+		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
 		VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_IIDR,
-		its_mmio_read_raz, its_mmio_write_wi, 4,
+		vgic_mmio_read_its_iidr, its_mmio_write_wi, 4,
 		VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_TYPER,
-		its_mmio_read_raz, its_mmio_write_wi, 8,
+		vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_CBASER,
-		its_mmio_read_raz, its_mmio_write_wi, 8,
+		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_CWRITER,
-		its_mmio_read_raz, its_mmio_write_wi, 8,
+		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_CREADR,
-		its_mmio_read_raz, its_mmio_write_wi, 8,
+		vgic_mmio_read_its_creadr, its_mmio_write_wi, 8,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_BASER,
-		its_mmio_read_raz, its_mmio_write_wi, 0x40,
+		vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
 		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
 	REGISTER_ITS_DESC(GITS_IDREGS_BASE,
-		its_mmio_read_raz, its_mmio_write_wi, 0x30,
+		vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
 		VGIC_ACCESS_32bit),
 };
 
@@ -109,6 +427,18 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
 	return ret;
 }
 
+#define INITIAL_BASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)		| \
+	 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)		| \
+	 GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)		| \
+	 ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT)			| \
+	 GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)		| \
+	 GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)	| \
+	 GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
 static int vgic_its_create(struct kvm_device *dev, u32 type)
 {
 	struct vgic_its *its;
@@ -120,12 +450,24 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
 	if (!its)
 		return -ENOMEM;
 
+	mutex_init(&its->its_lock);
+	mutex_init(&its->cmd_lock);
+
 	its->vgic_its_base = VGIC_ADDR_UNDEF;
 
+	INIT_LIST_HEAD(&its->device_list);
+	INIT_LIST_HEAD(&its->collection_list);
+
 	dev->kvm->arch.vgic.has_its = true;
 	its->initialized = false;
 	its->enabled = false;
 
+	its->baser_device_table = INITIAL_BASER_VALUE			|
+		((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+	its->baser_coll_table = INITIAL_BASER_VALUE |
+		((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+	dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
 	dev->private = its;
 
 	return 0;
@@ -133,7 +475,36 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
 
 static void vgic_its_destroy(struct kvm_device *kvm_dev)
 {
+	struct kvm *kvm = kvm_dev->kvm;
 	struct vgic_its *its = kvm_dev->private;
+	struct its_device *dev;
+	struct its_itte *itte;
+	struct list_head *dev_cur, *dev_temp;
+	struct list_head *cur, *temp;
+
+	/*
+	 * We may end up here without the lists ever having been initialized.
+	 * Check this and bail out early to avoid dereferencing a NULL pointer.
+	 */
+	if (!its->device_list.next)
+		return;
+
+	mutex_lock(&its->its_lock);
+	list_for_each_safe(dev_cur, dev_temp, &its->device_list) {
+		dev = container_of(dev_cur, struct its_device, dev_list);
+		list_for_each_safe(cur, temp, &dev->itt_head) {
+			itte = (container_of(cur, struct its_itte, itte_list));
+			its_free_itte(kvm, itte);
+		}
+		list_del(dev_cur);
+		kfree(dev);
+	}
+
+	list_for_each_safe(cur, temp, &its->collection_list) {
+		list_del(cur);
+		kfree(container_of(cur, struct its_collection, coll_list));
+	}
+	mutex_unlock(&its->its_lock);
 
 	kfree(its);
 }
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index a5c35050c7864d..84a301d789e015 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -23,15 +23,15 @@
 #include "vgic-mmio.h"
 
 /* extract @num bytes at @offset bytes offset in data */
-static unsigned long extract_bytes(unsigned long data, unsigned int offset,
-				   unsigned int num)
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+			    unsigned int num)
 {
 	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
 }
 
 /* allows updates of any half of a 64-bit register (or the whole thing) */
-static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-			    unsigned long val)
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+		     unsigned long val)
 {
 	int lower = (offset & 4) * 8;
 	int upper = lower + 8 * len - 1;
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 366d66378732b0..0b3ecf9d100eca 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -96,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
 void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
 				unsigned long data);
 
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+			    unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+		     unsigned long val);
+
 unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
 				 gpa_t addr, unsigned int len);
 
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index fb19a554d090c0..d3ba1b4227e7d3 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -33,10 +33,16 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
 
 /*
  * Locking order is always:
- *   vgic_cpu->ap_list_lock
- *     vgic_irq->irq_lock
+ * its->cmd_lock (mutex)
+ *   its->its_lock (mutex)
+ *     vgic_cpu->ap_list_lock
+ *       vgic_irq->irq_lock
  *
- * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-acquire it after having
+ * taken the upper one.
  *
  * When taking more than one ap_list_lock at the same time, always take the
  * lowest numbered VCPU's ap_list_lock first, so:

From 3802411d01880c4283426d22653e011159b1c947 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:33 +0100
Subject: [PATCH 267/302] KVM: arm64: vgic-its: Connect LPIs to the VGIC
 emulation

LPIs are dynamically created (mapped) at guest runtime and their
actual number can be quite high, but is mostly assigned using a very
sparse allocation scheme. So arrays are not an ideal data structure
to hold the information.
We use a spin-lock protected linked list to hold all mapped LPIs,
represented by their struct vgic_irq. This lock is grouped between the
ap_list_lock and the vgic_irq lock in our locking order.
Also we store a pointer to that struct vgic_irq in our struct its_itte,
so we can easily access it.
Eventually we call our new vgic_get_lpi() from vgic_get_irq(), so
the VGIC code transparently gets access to LPIs.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h        |  6 ++++
 virt/kvm/arm/vgic/vgic-init.c |  3 ++
 virt/kvm/arm/vgic/vgic-its.c  |  5 +++
 virt/kvm/arm/vgic/vgic-v3.c   |  2 ++
 virt/kvm/arm/vgic/vgic.c      | 63 +++++++++++++++++++++++++++++++----
 5 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 61867492d361d0..a6ca326055cffd 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -77,6 +77,7 @@ enum vgic_irq_config {
 
 struct vgic_irq {
 	spinlock_t irq_lock;		/* Protects the content of the struct */
+	struct list_head lpi_list;	/* Used to link all LPIs together */
 	struct list_head ap_list;
 
 	struct kvm_vcpu *vcpu;		/* SGIs and PPIs: The VCPU
@@ -193,6 +194,11 @@ struct vgic_dist {
 	 * GICv3 spec: 6.1.2 "LPI Configuration tables"
 	 */
 	u64			propbaser;
+
+	/* Protects the lpi_list and the count value below. */
+	spinlock_t		lpi_list_lock;
+	struct list_head	lpi_list_head;
+	int			lpi_list_count;
 };
 
 struct vgic_v2_cpu_if {
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index ac3c1a5f7bf485..535e713704f08f 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
 	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
 	int i;
 
+	INIT_LIST_HEAD(&dist->lpi_list_head);
+	spin_lock_init(&dist->lpi_list_lock);
+
 	dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
 	if (!dist->spis)
 		return  -ENOMEM;
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 11cfe2f12c6cda..14f91ff487ccee 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -56,6 +56,7 @@ struct its_collection {
 struct its_itte {
 	struct list_head itte_list;
 
+	struct vgic_irq *irq;
 	struct its_collection *collection;
 	u32 lpi;
 	u32 event_id;
@@ -148,6 +149,10 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
 static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
 {
 	list_del(&itte->itte_list);
+
+	/* This put matches the get in vgic_add_lpi. */
+	vgic_put_irq(kvm, itte->irq);
+
 	kfree(itte);
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 6f8f31f910e731..0506543df38a7a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 		else
 			intid = val & GICH_LR_VIRTUALID;
 		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+		if (!irq)	/* An LPI could have been unmapped. */
+			continue;
 
 		spin_lock(&irq->irq_lock);
 
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index d3ba1b4227e7d3..53299fc93c157c 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -36,7 +36,8 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
  * its->cmd_lock (mutex)
  *   its->its_lock (mutex)
  *     vgic_cpu->ap_list_lock
- *       vgic_irq->irq_lock
+ *       kvm->lpi_list_lock
+ *         vgic_irq->irq_lock
  *
  * If you need to take multiple locks, always take the upper lock first,
  * then the lower ones, e.g. first take the its_lock, then the irq_lock.
@@ -51,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
  *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
  */
 
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq = NULL;
+
+	spin_lock(&dist->lpi_list_lock);
+
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		if (irq->intid != intid)
+			continue;
+
+		/*
+		 * This increases the refcount, the caller is expected to
+		 * call vgic_put_irq() later once it's finished with the IRQ.
+		 */
+		kref_get(&irq->refcount);
+		goto out_unlock;
+	}
+	irq = NULL;
+
+out_unlock:
+	spin_unlock(&dist->lpi_list_lock);
+
+	return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid)
 {
@@ -62,9 +98,9 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 	if (intid <= VGIC_MAX_SPI)
 		return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
 
-	/* LPIs are not yet covered */
+	/* LPIs */
 	if (intid >= VGIC_MIN_LPI)
-		return NULL;
+		return vgic_get_lpi(kvm, intid);
 
 	WARN(1, "Looking up struct vgic_irq for reserved INTID");
 	return NULL;
@@ -78,18 +114,33 @@ static void vgic_get_irq_kref(struct vgic_irq *irq)
 	kref_get(&irq->refcount);
 }
 
-/* The refcount should never drop to 0 at the moment. */
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
 static void vgic_irq_release(struct kref *ref)
 {
-	WARN_ON(1);
 }
 
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 {
+	struct vgic_dist *dist;
+
 	if (irq->intid < VGIC_MIN_LPI)
 		return;
 
-	kref_put(&irq->refcount, vgic_irq_release);
+	if (!kref_put(&irq->refcount, vgic_irq_release))
+		return;
+
+	dist = &kvm->arch.vgic;
+
+	spin_lock(&dist->lpi_list_lock);
+	list_del(&irq->lpi_list);
+	dist->lpi_list_count--;
+	spin_unlock(&dist->lpi_list_lock);
+
+	kfree(irq);
 }
 
 /**

From 33d3bc9556a7dda5bba2cb6b2d08ae4841ae423e Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:34 +0100
Subject: [PATCH 268/302] KVM: arm64: vgic-its: Read initial LPI pending table

The LPI pending status for a GICv3 redistributor is held in a table
in (guest) memory. To achieve reasonable performance, we cache the
pending bit in our struct vgic_irq. The initial pending state must be
read from guest memory upon enabling LPIs for this redistributor.
As we can't access the guest memory while we hold the lpi_list spinlock,
we create a snapshot of the LPI list and iterate over that.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 95 ++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.h     |  5 ++
 2 files changed, 100 insertions(+)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 14f91ff487ccee..2881b84cbf75fc 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -67,6 +67,94 @@ struct its_itte {
  * supports more. Let's be restrictive here.
  */
 #define CBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
+#define PENDBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 16))
+
+/*
+ * Create a snapshot of the current LPI list, so that we can enumerate all
+ * LPIs without holding any lock.
+ * Returns the array length and puts the kmalloc'ed array into intid_ptr.
+ */
+static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	u32 *intids;
+	int irq_count = dist->lpi_list_count, i = 0;
+
+	/*
+	 * We use the current value of the list length, which may change
+	 * after the kmalloc. We don't care, because the guest shouldn't
+	 * change anything while the command handling is still running,
+	 * and in the worst case we would miss a new IRQ, which one wouldn't
+	 * expect to be covered by this command anyway.
+	 */
+	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+	if (!intids)
+		return -ENOMEM;
+
+	spin_lock(&dist->lpi_list_lock);
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		/* We don't need to "get" the IRQ, as we hold the list lock. */
+		intids[i] = irq->intid;
+		if (++i == irq_count)
+			break;
+	}
+	spin_unlock(&dist->lpi_list_lock);
+
+	*intid_ptr = intids;
+	return irq_count;
+}
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+	gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+	struct vgic_irq *irq;
+	int last_byte_offset = -1;
+	int ret = 0;
+	u32 *intids;
+	int nr_irqs, i;
+
+	nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids);
+	if (nr_irqs < 0)
+		return nr_irqs;
+
+	for (i = 0; i < nr_irqs; i++) {
+		int byte_offset, bit_nr;
+		u8 pendmask;
+
+		byte_offset = intids[i] / BITS_PER_BYTE;
+		bit_nr = intids[i] % BITS_PER_BYTE;
+
+		/*
+		 * For contiguously allocated LPIs chances are we just read
+		 * this very same byte in the last iteration. Reuse that.
+		 */
+		if (byte_offset != last_byte_offset) {
+			ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
+					     &pendmask, 1);
+			if (ret) {
+				kfree(intids);
+				return ret;
+			}
+			last_byte_offset = byte_offset;
+		}
+
+		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+		spin_lock(&irq->irq_lock);
+		irq->pending = pendmask & (1U << bit_nr);
+		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	kfree(intids);
+
+	return ret;
+}
 
 static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
 					     struct vgic_its *its,
@@ -403,6 +491,13 @@ static struct vgic_register_region its_registers[] = {
 		VGIC_ACCESS_32bit),
 };
 
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+		its_sync_lpi_pending_table(vcpu);
+}
+
 static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
 {
 	struct vgic_io_device *iodev = &its->iodev;
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 8192a293f119cb..ee348deb873792 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -25,6 +25,7 @@
 #define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
 
 #define INTERRUPT_ID_BITS_SPIS	10
+#define INTERRUPT_ID_BITS_ITS	16
 #define VGIC_PRI_BITS		5
 
 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -76,6 +77,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
 bool vgic_has_its(struct kvm *kvm);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 #else
 static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
 {
@@ -133,6 +135,9 @@ static inline bool vgic_has_its(struct kvm *kvm)
 	return false;
 }
 
+static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+}
 #endif
 
 int kvm_register_vgic_device(unsigned long type);

From f9f77af9e2a551ac34eb0eb40630d91d6dbd4295 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:35 +0100
Subject: [PATCH 269/302] KVM: arm64: vgic-its: Allow updates of LPI
 configuration table

The (system-wide) LPI configuration table is held in a table in
(guest) memory. To achieve reasonable performance, we cache this data
in our struct vgic_irq. If the guest updates the configuration data
(which consists of the enable bit and the priority value), it issues
an INV or INVALL command to allow us to update our information.
Provide functions that update that information for one LPI or all LPIs
mapped to a specific collection.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 2881b84cbf75fc..6f43b3b1172b73 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -68,6 +68,45 @@ struct its_itte {
  */
 #define CBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
 #define PENDBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 16))
+#define PROPBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
+
+#define GIC_LPI_OFFSET 8192
+
+#define LPI_PROP_ENABLE_BIT(p)	((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)	((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+			     struct kvm_vcpu *filter_vcpu)
+{
+	u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+	u8 prop;
+	int ret;
+
+	ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+			     &prop, 1);
+
+	if (ret)
+		return ret;
+
+	spin_lock(&irq->irq_lock);
+
+	if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+		irq->priority = LPI_PROP_PRIORITY(prop);
+		irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+		vgic_queue_irq_unlock(kvm, irq);
+	} else {
+		spin_unlock(&irq->irq_lock);
+	}
+
+	return 0;
+}
 
 /*
  * Create a snapshot of the current LPI list, so that we can enumerate all

From df9f58fbea9bc656b5a7770c885c97b26255b234 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:36 +0100
Subject: [PATCH 270/302] KVM: arm64: vgic-its: Implement ITS command queue
 command handlers

The connection between a device, an event ID, the LPI number and the
associated CPU is stored in in-memory tables in a GICv3, but their
format is not specified by the spec. Instead software uses a command
queue in a ring buffer to let an ITS implementation use its own
format.
Implement handlers for the various ITS commands and let them store
the requested relation into our own data structures. Those data
structures are protected by the its_lock mutex.
Our internal ring buffer read and write pointers are protected by the
its_cmd mutex, so that only one VCPU per ITS can handle commands at
any given time.
Error handling is very basic at the moment, as we don't have a good
way of communicating errors to the guest (usually an SError).
The INT command handler is missing from this patch, as we gain the
capability of actually injecting MSIs into the guest only later on.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 661 ++++++++++++++++++++++++++++++++++-
 1 file changed, 660 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 6f43b3b1172b73..1408c88d063e17 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -33,6 +33,67 @@
 #include "vgic.h"
 #include "vgic-mmio.h"
 
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+
+	/* In this case there is no put, since we keep the reference. */
+	if (irq)
+		return irq;
+
+	irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+	if (!irq)
+		return NULL;
+
+	INIT_LIST_HEAD(&irq->lpi_list);
+	INIT_LIST_HEAD(&irq->ap_list);
+	spin_lock_init(&irq->irq_lock);
+
+	irq->config = VGIC_CONFIG_EDGE;
+	kref_init(&irq->refcount);
+	irq->intid = intid;
+
+	spin_lock(&dist->lpi_list_lock);
+
+	/*
+	 * There could be a race with another vgic_add_lpi(), so we need to
+	 * check that we don't add a second list entry with the same LPI.
+	 */
+	list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+		if (oldirq->intid != intid)
+			continue;
+
+		/* Someone was faster with adding this LPI, lets use that. */
+		kfree(irq);
+		irq = oldirq;
+
+		/*
+		 * This increases the refcount, the caller is expected to
+		 * call vgic_put_irq() on the returned pointer once it's
+		 * finished with the IRQ.
+		 */
+		kref_get(&irq->refcount);
+
+		goto out_unlock;
+	}
+
+	list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+	dist->lpi_list_count++;
+
+out_unlock:
+	spin_unlock(&dist->lpi_list_lock);
+
+	return irq;
+}
+
 struct its_device {
 	struct list_head dev_list;
 
@@ -62,16 +123,75 @@ struct its_itte {
 	u32 event_id;
 };
 
+/*
+ * Find and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+	struct its_device *device;
+
+	list_for_each_entry(device, &its->device_list, dev_list)
+		if (device_id == device->device_id)
+			return device;
+
+	return NULL;
+}
+
+/*
+ * Find and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_itte *find_itte(struct vgic_its *its, u32 device_id,
+				  u32 event_id)
+{
+	struct its_device *device;
+	struct its_itte *itte;
+
+	device = find_its_device(its, device_id);
+	if (device == NULL)
+		return NULL;
+
+	list_for_each_entry(itte, &device->itt_head, itte_list)
+		if (itte->event_id == event_id)
+			return itte;
+
+	return NULL;
+}
+
+/* To be used as an iterator this macro misses the enclosing parentheses */
+#define for_each_lpi_its(dev, itte, its) \
+	list_for_each_entry(dev, &(its)->device_list, dev_list) \
+		list_for_each_entry(itte, &(dev)->itt_head, itte_list)
+
 /*
  * We only implement 48 bits of PA at the moment, although the ITS
  * supports more. Let's be restrictive here.
  */
+#define BASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 16))
 #define CBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
 #define PENDBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 16))
 #define PROPBASER_ADDRESS(x)	((x) & GENMASK_ULL(47, 12))
 
 #define GIC_LPI_OFFSET 8192
 
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+	struct its_collection *collection;
+
+	list_for_each_entry(collection, &its->collection_list, coll_list) {
+		if (coll_id == collection->collection_id)
+			return collection;
+	}
+
+	return NULL;
+}
+
 #define LPI_PROP_ENABLE_BIT(p)	((p) & LPI_PROP_ENABLED)
 #define LPI_PROP_PRIORITY(p)	((p) & 0xfc)
 
@@ -144,6 +264,51 @@ static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
 	return irq_count;
 }
 
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for a LPIs has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!its_is_collection_mapped(itte->collection))
+		return;
+
+	vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+
+	spin_lock(&itte->irq->irq_lock);
+	itte->irq->target_vcpu = vcpu;
+	spin_unlock(&itte->irq->irq_lock);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+				       struct its_collection *coll)
+{
+	struct its_device *device;
+	struct its_itte *itte;
+
+	for_each_lpi_its(device, itte, its) {
+		if (!itte->collection || coll != itte->collection)
+			continue;
+
+		update_affinity_itte(kvm, itte);
+	}
+}
+
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+	int nr_idbits = (propbaser & 0x1f) + 1;
+
+	return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
 /*
  * Scan the whole LPI pending table and sync the pending bit in there
  * with our own data structures. This relies on the LPI being
@@ -283,10 +448,504 @@ static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
 	kfree(itte);
 }
 
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+	return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd)	its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)	its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd)		its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)	its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)	its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_target_addr(cmd)	its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)	its_cmd_mask_field(cmd, 2, 63,  1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+				       u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_itte *itte;
+
+
+	itte = find_itte(its, device_id, event_id);
+	if (itte && itte->collection) {
+		/*
+		 * Though the spec talks about removing the pending state, we
+		 * don't bother here since we clear the ITTE anyway and the
+		 * pending state is a property of the ITTE struct.
+		 */
+		its_free_itte(kvm, itte);
+		return 0;
+	}
+
+	return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct kvm_vcpu *vcpu;
+	struct its_itte *itte;
+	struct its_collection *collection;
+
+	itte = find_itte(its, device_id, event_id);
+	if (!itte)
+		return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+	if (!its_is_collection_mapped(itte->collection))
+		return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+	collection = find_collection(its, coll_id);
+	if (!its_is_collection_mapped(collection))
+		return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+	itte->collection = collection;
+	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	spin_lock(&itte->irq->irq_lock);
+	itte->irq->target_vcpu = vcpu;
+	spin_unlock(&itte->irq->irq_lock);
+
+	return 0;
+}
+
+static void vgic_its_init_collection(struct vgic_its *its,
+				     struct its_collection *collection,
+				     u32 coll_id)
+{
+	collection->collection_id = coll_id;
+	collection->target_addr = COLLECTION_NOT_MAPPED;
+
+	list_add_tail(&collection->coll_list, &its->collection_list);
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd, u8 subcmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct its_itte *itte;
+	struct its_device *device;
+	struct its_collection *collection, *new_coll = NULL;
+	int lpi_nr;
+
+	device = find_its_device(its, device_id);
+	if (!device)
+		return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+	collection = find_collection(its, coll_id);
+	if (!collection) {
+		new_coll = kzalloc(sizeof(struct its_collection), GFP_KERNEL);
+		if (!new_coll)
+			return -ENOMEM;
+	}
+
+	if (subcmd == GITS_CMD_MAPTI)
+		lpi_nr = its_cmd_get_physical_id(its_cmd);
+	else
+		lpi_nr = event_id;
+	if (lpi_nr < GIC_LPI_OFFSET ||
+	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) {
+		kfree(new_coll);
+		return E_ITS_MAPTI_PHYSICALID_OOR;
+	}
+
+	itte = find_itte(its, device_id, event_id);
+	if (!itte) {
+		itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
+		if (!itte) {
+			kfree(new_coll);
+			return -ENOMEM;
+		}
+
+		itte->event_id	= event_id;
+		list_add_tail(&itte->itte_list, &device->itt_head);
+	}
+
+	if (!collection) {
+		collection = new_coll;
+		vgic_its_init_collection(its, collection, coll_id);
+	}
+
+	itte->collection = collection;
+	itte->lpi = lpi_nr;
+	itte->irq = vgic_add_lpi(kvm, lpi_nr);
+	update_affinity_itte(kvm, itte);
+
+	/*
+	 * We "cache" the configuration table entries in out struct vgic_irq's.
+	 * However we only have those structs for mapped IRQs, so we read in
+	 * the respective config data from memory here upon mapping the LPI.
+	 */
+	update_lpi_config(kvm, itte->irq, NULL);
+
+	return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+{
+	struct its_itte *itte, *temp;
+
+	/*
+	 * The spec says that unmapping a device with still valid
+	 * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+	 * since we cannot leave the memory unreferenced.
+	 */
+	list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list)
+		its_free_itte(kvm, itte);
+
+	list_del(&device->dev_list);
+	kfree(device);
+}
+
+/*
+ * Check whether a device ID can be stored into the guest device tables.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessbible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
+				     int device_id)
+{
+	u64 r = its->baser_device_table;
+	int nr_entries = GITS_BASER_NR_PAGES(r) * SZ_64K;
+	int index;
+	u64 indirect_ptr;
+	gfn_t gfn;
+
+
+	if (!(r & GITS_BASER_INDIRECT))
+		return device_id < (nr_entries / GITS_BASER_ENTRY_SIZE(r));
+
+	/* calculate and check the index into the 1st level */
+	index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r));
+	if (index >= (nr_entries / sizeof(u64)))
+		return false;
+
+	/* Each 1st level entry is represented by a 64-bit value. */
+	if (!kvm_read_guest(kvm,
+			    BASER_ADDRESS(r) + index * sizeof(indirect_ptr),
+			    &indirect_ptr, sizeof(indirect_ptr)))
+		return false;
+
+	/* check the valid bit of the first level entry */
+	if (!(indirect_ptr & BIT_ULL(63)))
+		return false;
+
+	/*
+	 * Mask the guest physical address and calculate the frame number.
+	 * Any address beyond our supported 48 bits of PA will be caught
+	 * by the actual check in the final step.
+	 */
+	gfn = (indirect_ptr & GENMASK_ULL(51, 16)) >> PAGE_SHIFT;
+
+	return kvm_is_visible_gfn(kvm, gfn);
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	bool valid = its_cmd_get_validbit(its_cmd);
+	struct its_device *device;
+
+	if (!vgic_its_check_device_id(kvm, its, device_id))
+		return E_ITS_MAPD_DEVICE_OOR;
+
+	device = find_its_device(its, device_id);
+
+	/*
+	 * The spec says that calling MAPD on an already mapped device
+	 * invalidates all cached data for this device. We implement this
+	 * by removing the mapping and re-establishing it.
+	 */
+	if (device)
+		vgic_its_unmap_device(kvm, device);
+
+	/*
+	 * The spec does not say whether unmapping a not-mapped device
+	 * is an error, so we are done in any case.
+	 */
+	if (!valid)
+		return 0;
+
+	device = kzalloc(sizeof(struct its_device), GFP_KERNEL);
+	if (!device)
+		return -ENOMEM;
+
+	device->device_id = device_id;
+	INIT_LIST_HEAD(&device->itt_head);
+
+	list_add_tail(&device->dev_list, &its->device_list);
+
+	return 0;
+}
+
+static int vgic_its_nr_collection_ids(struct vgic_its *its)
+{
+	u64 r = its->baser_coll_table;
+
+	return (GITS_BASER_NR_PAGES(r) * SZ_64K) / GITS_BASER_ENTRY_SIZE(r);
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u16 coll_id;
+	u32 target_addr;
+	struct its_collection *collection;
+	bool valid;
+
+	valid = its_cmd_get_validbit(its_cmd);
+	coll_id = its_cmd_get_collection(its_cmd);
+	target_addr = its_cmd_get_target_addr(its_cmd);
+
+	if (target_addr >= atomic_read(&kvm->online_vcpus))
+		return E_ITS_MAPC_PROCNUM_OOR;
+
+	if (coll_id >= vgic_its_nr_collection_ids(its))
+		return E_ITS_MAPC_COLLECTION_OOR;
+
+	collection = find_collection(its, coll_id);
+
+	if (!valid) {
+		struct its_device *device;
+		struct its_itte *itte;
+		/*
+		 * Clearing the mapping for that collection ID removes the
+		 * entry from the list. If there wasn't any before, we can
+		 * go home early.
+		 */
+		if (!collection)
+			return 0;
+
+		for_each_lpi_its(device, itte, its)
+			if (itte->collection &&
+			    itte->collection->collection_id == coll_id)
+				itte->collection = NULL;
+
+		list_del(&collection->coll_list);
+		kfree(collection);
+	} else {
+		if (!collection) {
+			collection = kzalloc(sizeof(struct its_collection),
+					     GFP_KERNEL);
+			if (!collection)
+				return -ENOMEM;
+
+			vgic_its_init_collection(its, collection, coll_id);
+			collection->target_addr = target_addr;
+		} else {
+			collection->target_addr = target_addr;
+			update_affinity_collection(kvm, its, collection);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+				     u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_itte *itte;
+
+
+	itte = find_itte(its, device_id, event_id);
+	if (!itte)
+		return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+	itte->irq->pending = false;
+
+	return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_itte *itte;
+
+
+	itte = find_itte(its, device_id, event_id);
+	if (!itte)
+		return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+	return update_lpi_config(kvm, itte->irq, NULL);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+				      u64 *its_cmd)
+{
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct its_collection *collection;
+	struct kvm_vcpu *vcpu;
+	struct vgic_irq *irq;
+	u32 *intids;
+	int irq_count, i;
+
+	collection = find_collection(its, coll_id);
+	if (!its_is_collection_mapped(collection))
+		return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	irq_count = vgic_copy_lpi_list(kvm, &intids);
+	if (irq_count < 0)
+		return irq_count;
+
+	for (i = 0; i < irq_count; i++) {
+		irq = vgic_get_irq(kvm, NULL, intids[i]);
+		if (!irq)
+			continue;
+		update_lpi_config(kvm, irq, vcpu);
+		vgic_put_irq(kvm, irq);
+	}
+
+	kfree(intids);
+
+	return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+				      u64 *its_cmd)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+	u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+	struct kvm_vcpu *vcpu1, *vcpu2;
+	struct vgic_irq *irq;
+
+	if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+	    target2_addr >= atomic_read(&kvm->online_vcpus))
+		return E_ITS_MOVALL_PROCNUM_OOR;
+
+	if (target1_addr == target2_addr)
+		return 0;
+
+	vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+	vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+	spin_lock(&dist->lpi_list_lock);
+
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		spin_lock(&irq->irq_lock);
+
+		if (irq->target_vcpu == vcpu1)
+			irq->target_vcpu = vcpu2;
+
+		spin_unlock(&irq->irq_lock);
+	}
+
+	spin_unlock(&dist->lpi_list_lock);
+
+	return 0;
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
 static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
 				   u64 *its_cmd)
 {
-	return -ENODEV;
+	u8 cmd = its_cmd_get_command(its_cmd);
+	int ret = -ENODEV;
+
+	mutex_lock(&its->its_lock);
+	switch (cmd) {
+	case GITS_CMD_MAPD:
+		ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MAPC:
+		ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MAPI:
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd);
+		break;
+	case GITS_CMD_MAPTI:
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd);
+		break;
+	case GITS_CMD_MOVI:
+		ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_DISCARD:
+		ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_CLEAR:
+		ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MOVALL:
+		ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_INV:
+		ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_INVALL:
+		ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_SYNC:
+		/* we ignore this command: we are in sync all of the time */
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&its->its_lock);
+
+	return ret;
 }
 
 static u64 vgic_sanitise_its_baser(u64 reg)

From 2891a7dfb6c4a273996f0047660a75e88e3b8690 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:37 +0100
Subject: [PATCH 271/302] KVM: arm64: vgic-its: Implement MSI injection in ITS
 emulation

When userland wants to inject an MSI into the guest, it uses the
KVM_SIGNAL_MSI ioctl, which carries the doorbell address along with
the payload and the device ID.
With the help of the KVM IO bus framework we learn the corresponding
ITS from the doorbell address. We then use our wrapper functions to
iterate the linked lists and find the proper Interrupt Translation Table
Entry (ITTE) and thus the corresponding struct vgic_irq to finally set
the pending bit.
We also provide the handler for the ITS "INT" command, which allows a
guest to trigger an MSI via the ITS command queue. Since this one knows
about the right ITS already, we directly call the MMIO handler function
without using the kvm_io_bus framework.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 77 ++++++++++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.h     |  6 +++
 2 files changed, 83 insertions(+)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 1408c88d063e17..d8e8f14135b494 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -437,6 +437,65 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
 	return 0;
 }
 
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ */
+static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+				 u32 devid, u32 eventid)
+{
+	struct its_itte *itte;
+
+	if (!its->enabled)
+		return;
+
+	itte = find_itte(its, devid, eventid);
+	/* Triggering an unmapped IRQ gets silently dropped. */
+	if (itte && its_is_collection_mapped(itte->collection)) {
+		struct kvm_vcpu *vcpu;
+
+		vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+		if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) {
+			spin_lock(&itte->irq->irq_lock);
+			itte->irq->pending = true;
+			vgic_queue_irq_unlock(kvm, itte->irq);
+		}
+	}
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	u64 address;
+	struct kvm_io_device *kvm_io_dev;
+	struct vgic_io_device *iodev;
+
+	if (!vgic_has_its(kvm))
+		return -ENODEV;
+
+	if (!(msi->flags & KVM_MSI_VALID_DEVID))
+		return -EINVAL;
+
+	address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+	kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+	if (!kvm_io_dev)
+		return -ENODEV;
+
+	iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+
+	mutex_lock(&iodev->its->its_lock);
+	vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data);
+	mutex_unlock(&iodev->its->its_lock);
+
+	return 0;
+}
+
 /* Requires the its_lock to be held. */
 static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
 {
@@ -896,6 +955,21 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
 	return 0;
 }
 
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	u32 msi_data = its_cmd_get_id(its_cmd);
+	u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+	vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+
+	return 0;
+}
+
 /*
  * This function is called with the its_cmd lock held, but the ITS data
  * structure lock dropped.
@@ -932,6 +1006,9 @@ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
 	case GITS_CMD_MOVALL:
 		ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
 		break;
+	case GITS_CMD_INT:
+		ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+		break;
 	case GITS_CMD_INV:
 		ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
 		break;
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index ee348deb873792..9d557f25cbfc20 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -78,6 +78,7 @@ int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
 bool vgic_has_its(struct kvm *kvm);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
 static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
 {
@@ -138,6 +139,11 @@ static inline bool vgic_has_its(struct kvm *kvm)
 static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
 {
 }
+
+static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	return -ENODEV;
+}
 #endif
 
 int kvm_register_vgic_device(unsigned long type);

From 0e4e82f154e387969ea7ecd2c8876689fb68f710 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 15 Jul 2016 12:43:38 +0100
Subject: [PATCH 272/302] KVM: arm64: vgic-its: Enable ITS emulation as a
 virtual MSI controller

Now that all ITS emulation functionality is in place, we advertise
MSI functionality to userland and also the ITS device to the guest - if
userland has configured that.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt   |  2 +-
 arch/arm64/kvm/Kconfig              |  1 +
 arch/arm64/kvm/Makefile             |  1 +
 arch/arm64/kvm/reset.c              |  6 ++++++
 include/kvm/arm_vgic.h              |  5 +++++
 virt/kvm/arm/vgic/vgic-init.c       |  3 +++
 virt/kvm/arm/vgic/vgic-kvm-device.c |  3 +++
 virt/kvm/arm/vgic/vgic-mmio-v3.c    | 14 ++++++++++----
 virt/kvm/arm/vgic/vgic.c            |  8 ++++++++
 virt/kvm/arm/vgic/vgic.h            |  6 ++++++
 10 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 65513119fee87d..07049eadb12432 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2162,7 +2162,7 @@ after pausing the vcpu, but before it is resumed.
 4.71 KVM_SIGNAL_MSI
 
 Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
 Type: vm ioctl
 Parameters: struct kvm_msi (in)
 Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index aa2e34e99582df..9d2eff0b3ad347 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
 	select HAVE_KVM_IRQFD
 	select KVM_ARM_VGIC_V3
 	select KVM_ARM_PMU if HW_PERF_EVENTS
+	select HAVE_KVM_MSI
 	---help---
 	  Support hosting virtualized guest machines.
 	  We don't support KVM with 16K page tables yet, due to the multiple
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index f00b2cdd0d337d..a5b96642a9cb9c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -29,5 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e95d4f68bf544f..5bc460884639f1 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VCPU_ATTRIBUTES:
 		r = 1;
 		break;
+	case KVM_CAP_MSI_DEVID:
+		if (!kvm)
+			r = -EINVAL;
+		else
+			r = kvm->arch.vgic.msis_require_devid;
+		break;
 	default:
 		r = 0;
 	}
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index a6ca326055cffd..4e63a07b9001f5 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -163,6 +163,9 @@ struct vgic_dist {
 	/* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
 	u32			vgic_model;
 
+	/* Do injected MSIs require an additional device ID? */
+	bool			msis_require_devid;
+
 	int			nr_spis;
 
 	/* TODO: Consider moving to global state */
@@ -308,4 +311,6 @@ static inline int kvm_vgic_get_max_vcpus(void)
 	return kvm_vgic_global_state.max_gic_vcpus;
 }
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
 #endif /* __KVM_ARM_VGIC_H */
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 535e713704f08f..01a60dcd05d626 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -258,6 +258,9 @@ int vgic_init(struct kvm *kvm)
 	if (ret)
 		goto out;
 
+	if (vgic_has_its(kvm))
+		dist->msis_require_devid = true;
+
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_vgic_vcpu_init(vcpu);
 
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 561d2ba96a4f2e..1813f93b5cde0a 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -223,6 +223,9 @@ int kvm_register_vgic_device(unsigned long type)
 	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
 					      KVM_DEV_TYPE_ARM_VGIC_V3);
+		if (ret)
+			break;
+		ret = kvm_vgic_register_its_device();
 		break;
 #endif
 	}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 84a301d789e015..ff668e0dd586de 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -66,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
 	case GICD_TYPER:
 		value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
 		value = (value >> 5) - 1;
-		value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+		if (vgic_has_its(vcpu->kvm)) {
+			value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+			value |= GICD_TYPER_LPIS;
+		} else {
+			value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+		}
 		break;
 	case GICD_IIDR:
 		value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
@@ -163,9 +168,8 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
 
 	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
 
-	if (!was_enabled && vgic_cpu->lpis_enabled) {
-		/* Eventually do something */
-	}
+	if (!was_enabled && vgic_cpu->lpis_enabled)
+		vgic_enable_lpis(vcpu);
 }
 
 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -179,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
 	value |= ((target_vcpu_id & 0xffff) << 8);
 	if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
 		value |= GICR_TYPER_LAST;
+	if (vgic_has_its(vcpu->kvm))
+		value |= GICR_TYPER_PLPIS;
 
 	return extract_bytes(value, addr & 7, len);
 }
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 53299fc93c157c..424cb9ceebd978 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -718,3 +718,11 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 
 	return map_is_active;
 }
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	if (vgic_has_its(kvm))
+		return vgic_its_inject_msi(kvm, msi);
+	else
+		return -ENODEV;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 9d557f25cbfc20..9d40d7bb89f7ee 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -77,6 +77,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info);
 int vgic_v3_map_resources(struct kvm *kvm);
 int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
 bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
 #else
@@ -136,6 +137,11 @@ static inline bool vgic_has_its(struct kvm *kvm)
 	return false;
 }
 
+static inline int kvm_vgic_register_its_device(void)
+{
+	return -ENODEV;
+}
+
 static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
 {
 }

From 9d5fcb9dd74b5e0070ef2f66f7f4ae14a23b0206 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Mon, 18 Jul 2016 10:57:36 +0000
Subject: [PATCH 273/302] KVM: arm/arm64: Fix vGICv2
 KVM_DEV_ARM_VGIC_GRP_CPU/DIST_REGS

VGICv2 save and restore accesses the CPU interface registers, but
the MMIO dispatch code no longer performs reads or writes for
IODEV_CPUIF devices. Restore that behaviour, and also explicitly
set the iodev_type for both the DIST and CPU interface.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-mmio-v2.c | 2 ++
 virt/kvm/arm/vgic/vgic-mmio.c    | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 4152348f5e4f14..b44b359cbbadee 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -437,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 	struct vgic_io_device dev = {
 		.regions = vgic_v2_cpu_registers,
 		.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+		.iodev_type = IODEV_CPUIF,
 	};
 
 	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
@@ -448,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 	struct vgic_io_device dev = {
 		.regions = vgic_v2_dist_registers,
 		.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+		.iodev_type = IODEV_DIST,
 	};
 
 	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 26be827bbfcc54..3bad3c5ed431cb 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -484,7 +484,8 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 
 	switch (iodev->iodev_type) {
 	case IODEV_CPUIF:
-		return 1;
+		data = region->read(vcpu, addr, len);
+		break;
 	case IODEV_DIST:
 		data = region->read(vcpu, addr, len);
 		break;
@@ -517,6 +518,7 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 
 	switch (iodev->iodev_type) {
 	case IODEV_CPUIF:
+		region->write(vcpu, addr, len, data);
 		break;
 	case IODEV_DIST:
 		region->write(vcpu, addr, len, data);

From 8c828a535e29f50282f1a49a52c3b20ccaa039aa Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 18 Jul 2016 15:28:52 +0100
Subject: [PATCH 274/302] irqchip/gicv3-its: Restore all cacheability
 attributes

Let's restore some of the #defines that have been savagely dropped
by the introduction of the KVM ITS code, as this pointlessly breaks
other users (including series that are already in -next).

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 48 +++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 9442be7f2461b9..700b4216c87a35 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -146,8 +146,16 @@
 
 #define GICR_PROPBASER_InnerShareable					\
 	GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
-#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
-#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+
+#define GICR_PROPBASER_nCnB	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+#define GICR_PROPBASER_nC 	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_RaWt	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_RaWb	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)
+#define GICR_PROPBASER_WaWt	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+#define GICR_PROPBASER_WaWb	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_RaWaWt	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+#define GICR_PROPBASER_RaWaWb	GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+
 #define GICR_PROPBASER_IDBITS_MASK			(0x1f)
 
 #define GICR_PENDBASER_SHAREABILITY_SHIFT		(10)
@@ -163,8 +171,16 @@
 
 #define GICR_PENDBASER_InnerShareable					\
 	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
-#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
-#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+
+#define GICR_PENDBASER_nCnB	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+#define GICR_PENDBASER_nC 	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_RaWt	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_RaWb	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)
+#define GICR_PENDBASER_WaWt	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+#define GICR_PENDBASER_WaWb	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_RaWaWt	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+#define GICR_PENDBASER_RaWaWb	GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+
 #define GICR_PENDBASER_PTZ				BIT_ULL(62)
 
 /*
@@ -237,24 +253,40 @@
 
 #define GITS_CBASER_InnerShareable					\
 	GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
-#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
-#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+
+#define GITS_CBASER_nCnB	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+#define GITS_CBASER_nC		GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_RaWt	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_RaWb	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb)
+#define GITS_CBASER_WaWt	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+#define GITS_CBASER_WaWb	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+#define GITS_CBASER_RaWaWt	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+#define GITS_CBASER_RaWaWb	GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
 
 #define GITS_BASER_NR_REGS		8
 
 #define GITS_BASER_VALID			(1UL << 63)
 #define GITS_BASER_INDIRECT			(1ULL << 62)
+
 #define GITS_BASER_INNER_CACHEABILITY_SHIFT	(59)
 #define GITS_BASER_OUTER_CACHEABILITY_SHIFT	(53)
 #define GITS_BASER_INNER_CACHEABILITY_MASK				\
 	GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_CACHEABILITY_MASK		GITS_BASER_INNER_CACHEABILITY_MASK
 #define GITS_BASER_OUTER_CACHEABILITY_MASK				\
 	GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
 #define GITS_BASER_SHAREABILITY_MASK					\
 	GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
 
-#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
-#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_nCnB		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+#define GITS_BASER_nC		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_RaWt		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_RaWb		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)
+#define GITS_BASER_WaWt		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+#define GITS_BASER_WaWb		GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_RaWaWt	GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+#define GITS_BASER_RaWaWb	GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+
 #define GITS_BASER_TYPE_SHIFT			(56)
 #define GITS_BASER_TYPE(r)		(((r) >> GITS_BASER_TYPE_SHIFT) & 7)
 #define GITS_BASER_ENTRY_SIZE_SHIFT		(48)

From d97594e6bc1b4aaad3ccae3ef678513b63dd5221 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 11:27:23 +0100
Subject: [PATCH 275/302] KVM: arm64: vgic-its: Generalize use of
 vgic_get_irq_kref

Instead of sprinkling raw kref_get() calls every time we cannot
do a normal vgic_get_irq(), use the existing vgic_get_irq_kref(),
which does the same thing and is paired with a vgic_put_irq().

vgic_get_irq_kref is moved to vgic.h in order to be easily shared.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c |  2 +-
 virt/kvm/arm/vgic/vgic.c     | 10 +---------
 virt/kvm/arm/vgic/vgic.h     |  8 ++++++++
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index d8e8f14135b494..f427fa2f7263f4 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -80,7 +80,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
 		 * call vgic_put_irq() on the returned pointer once it's
 		 * finished with the IRQ.
 		 */
-		kref_get(&irq->refcount);
+		vgic_get_irq_kref(irq);
 
 		goto out_unlock;
 	}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 424cb9ceebd978..39f3358c6d91a1 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
 		 * This increases the refcount, the caller is expected to
 		 * call vgic_put_irq() later once it's finished with the IRQ.
 		 */
-		kref_get(&irq->refcount);
+		vgic_get_irq_kref(irq);
 		goto out_unlock;
 	}
 	irq = NULL;
@@ -106,14 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 	return NULL;
 }
 
-static void vgic_get_irq_kref(struct vgic_irq *irq)
-{
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
-
-	kref_get(&irq->refcount);
-}
-
 /*
  * We can't do anything in here, because we lack the kvm pointer to
  * lock and remove the item from the lpi_list. So we keep this function
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 9d40d7bb89f7ee..1d8e21d5c13f58 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -64,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm);
 int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
 			     enum vgic_type);
 
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	kref_get(&irq->refcount);
+}
+
 #ifdef CONFIG_KVM_ARM_VGIC_V3
 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);

From c0091073dd775d0446a9f88dda8c9a86b64340b2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 18 Jul 2016 16:16:26 +0100
Subject: [PATCH 276/302] KVM: arm64: vgic-its: Fix handling of indirect tables

The current code will fail on valid indirect tables, and happily
use the ones that are pointing out of the guest RAM. Funny what a
small "!" can do for you...

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index f427fa2f7263f4..d6697c4d81ec5c 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -702,9 +702,9 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
 		return false;
 
 	/* Each 1st level entry is represented by a 64-bit value. */
-	if (!kvm_read_guest(kvm,
-			    BASER_ADDRESS(r) + index * sizeof(indirect_ptr),
-			    &indirect_ptr, sizeof(indirect_ptr)))
+	if (kvm_read_guest(kvm,
+			   BASER_ADDRESS(r) + index * sizeof(indirect_ptr),
+			   &indirect_ptr, sizeof(indirect_ptr)))
 		return false;
 
 	/* check the valid bit of the first level entry */

From 7e3963a51563d844fcd3bdc13e2847561b15e8de Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 11:34:53 +0100
Subject: [PATCH 277/302] KVM: arm64: vgic-its: Fix vgic_its_check_device_id BE
 handling

The ITS tables are stored in LE format. If the host is reading
a L1 table entry to check its validity, it must convert it to
the CPU endianness.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index d6697c4d81ec5c..2ac5927b1d9136 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -707,6 +707,8 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
 			   &indirect_ptr, sizeof(indirect_ptr)))
 		return false;
 
+	indirect_ptr = le64_to_cpu(indirect_ptr);
+
 	/* check the valid bit of the first level entry */
 	if (!(indirect_ptr & BIT_ULL(63)))
 		return false;

From b90338b7cbb7c8cad8dbd3c4de4e64180ce0d88b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 12:15:19 +0100
Subject: [PATCH 278/302] KVM: arm64: vgic-its: Fix misleading nr_entries in
 vgic_its_check_device_id

The nr_entries variable in vgic_its_check_device_id actually
describes the size of the L1 table, and not the number of
entries in this table.

Rename it to l1_tbl_size, so that we can now change the code
with a better understanding of what is what.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 2ac5927b1d9136..268a0c7ea3a5a9 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -687,18 +687,18 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
 				     int device_id)
 {
 	u64 r = its->baser_device_table;
-	int nr_entries = GITS_BASER_NR_PAGES(r) * SZ_64K;
+	int l1_tbl_size = GITS_BASER_NR_PAGES(r) * SZ_64K;
 	int index;
 	u64 indirect_ptr;
 	gfn_t gfn;
 
 
 	if (!(r & GITS_BASER_INDIRECT))
-		return device_id < (nr_entries / GITS_BASER_ENTRY_SIZE(r));
+		return device_id < (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r));
 
 	/* calculate and check the index into the 1st level */
 	index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r));
-	if (index >= (nr_entries / sizeof(u64)))
+	if (index >= (l1_tbl_size / sizeof(u64)))
 		return false;
 
 	/* Each 1st level entry is represented by a 64-bit value. */

From 333a53ff7fb9d836ff4a2b7f266ac9b2bb85e873 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 13:00:49 +0100
Subject: [PATCH 279/302] KVM: arm64: vgic-its: Validate the device table L1
 entry

Checking that the device_id fits in the table is not enough: we must
also make sure that the associated memory is accessible.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 268a0c7ea3a5a9..4943d6aebdd1e6 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -693,8 +693,17 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
 	gfn_t gfn;
 
 
-	if (!(r & GITS_BASER_INDIRECT))
-		return device_id < (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r));
+	if (!(r & GITS_BASER_INDIRECT)) {
+		phys_addr_t addr;
+
+		if (device_id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r)))
+			return false;
+
+		addr = BASER_ADDRESS(r) + device_id * GITS_BASER_ENTRY_SIZE(r);
+		gfn = addr >> PAGE_SHIFT;
+
+		return kvm_is_visible_gfn(kvm, gfn);
+	}
 
 	/* calculate and check the index into the 1st level */
 	index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r));

From d6c7f865f00adf98ca79712167fb0f1b9dccb272 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 11:48:47 +0100
Subject: [PATCH 280/302] KVM: arm64: vgic-its: Fix L2 entry validation for
 indirect tables

When checking that the storage address of a device entry is valid,
it is critical to compute the actual address of the entry, rather
than relying on the beginning of the page to match a CPU page of
the same size: for example, if the guest places the table at the
last 64kB boundary of RAM, but RAM size isn't a multiple of 64kB...

Fix this by computing the actual offset of the device ID in the
L2 page, and check the corresponding GFN.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 4943d6aebdd1e6..2faf1f458e8adb 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -727,7 +727,12 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
 	 * Any address beyond our supported 48 bits of PA will be caught
 	 * by the actual check in the final step.
 	 */
-	gfn = (indirect_ptr & GENMASK_ULL(51, 16)) >> PAGE_SHIFT;
+	indirect_ptr &= GENMASK_ULL(51, 16);
+
+	/* Find the address of the actual entry */
+	index = device_id % (SZ_64K / GITS_BASER_ENTRY_SIZE(r));
+	indirect_ptr += index * GITS_BASER_ENTRY_SIZE(r);
+	gfn = indirect_ptr >> PAGE_SHIFT;
 
 	return kvm_is_visible_gfn(kvm, gfn);
 }

From 17a21f58ff3e60fef3df788561b65e576a0b494d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 20:01:46 +0100
Subject: [PATCH 281/302] KVM: arm64: vgic-its: Add collection
 allocator/destructor

Instead of spreading random allocations all over the place,
consolidate allocation/init/freeing of collections in a pair
of constructor/destructor.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 92 +++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 38 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 2faf1f458e8adb..d6f68e9c946dd8 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -581,14 +581,45 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
 	return 0;
 }
 
-static void vgic_its_init_collection(struct vgic_its *its,
-				     struct its_collection *collection,
+static int vgic_its_alloc_collection(struct vgic_its *its,
+				     struct its_collection **colp,
 				     u32 coll_id)
 {
+	struct its_collection *collection;
+
+	collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+
 	collection->collection_id = coll_id;
 	collection->target_addr = COLLECTION_NOT_MAPPED;
 
 	list_add_tail(&collection->coll_list, &its->collection_list);
+	*colp = collection;
+
+	return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+	struct its_collection *collection;
+	struct its_device *device;
+	struct its_itte *itte;
+
+	/*
+	 * Clearing the mapping for that collection ID removes the
+	 * entry from the list. If there wasn't any before, we can
+	 * go home early.
+	 */
+	collection = find_collection(its, coll_id);
+	if (!collection)
+		return;
+
+	for_each_lpi_its(device, itte, its)
+		if (itte->collection &&
+		    itte->collection->collection_id == coll_id)
+			itte->collection = NULL;
+
+	list_del(&collection->coll_list);
+	kfree(collection);
 }
 
 /*
@@ -605,6 +636,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 	struct its_device *device;
 	struct its_collection *collection, *new_coll = NULL;
 	int lpi_nr;
+	int ret;
 
 	device = find_its_device(its, device_id);
 	if (!device)
@@ -612,9 +644,10 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 
 	collection = find_collection(its, coll_id);
 	if (!collection) {
-		new_coll = kzalloc(sizeof(struct its_collection), GFP_KERNEL);
-		if (!new_coll)
-			return -ENOMEM;
+		ret = vgic_its_alloc_collection(its, &collection, coll_id);
+		if (ret)
+			return ret;
+		new_coll = collection;
 	}
 
 	if (subcmd == GITS_CMD_MAPTI)
@@ -623,27 +656,22 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 		lpi_nr = event_id;
 	if (lpi_nr < GIC_LPI_OFFSET ||
 	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) {
-		kfree(new_coll);
-		return E_ITS_MAPTI_PHYSICALID_OOR;
+		ret = E_ITS_MAPTI_PHYSICALID_OOR;
+		goto err;
 	}
 
 	itte = find_itte(its, device_id, event_id);
 	if (!itte) {
 		itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
 		if (!itte) {
-			kfree(new_coll);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto err;
 		}
 
 		itte->event_id	= event_id;
 		list_add_tail(&itte->itte_list, &device->itt_head);
 	}
 
-	if (!collection) {
-		collection = new_coll;
-		vgic_its_init_collection(its, collection, coll_id);
-	}
-
 	itte->collection = collection;
 	itte->lpi = lpi_nr;
 	itte->irq = vgic_add_lpi(kvm, lpi_nr);
@@ -657,6 +685,10 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 	update_lpi_config(kvm, itte->irq, NULL);
 
 	return 0;
+err:
+	if (new_coll)
+		vgic_its_free_collection(its, coll_id);
+	return ret;
 }
 
 /* Requires the its_lock to be held. */
@@ -809,34 +841,18 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
 	if (coll_id >= vgic_its_nr_collection_ids(its))
 		return E_ITS_MAPC_COLLECTION_OOR;
 
-	collection = find_collection(its, coll_id);
-
 	if (!valid) {
-		struct its_device *device;
-		struct its_itte *itte;
-		/*
-		 * Clearing the mapping for that collection ID removes the
-		 * entry from the list. If there wasn't any before, we can
-		 * go home early.
-		 */
-		if (!collection)
-			return 0;
-
-		for_each_lpi_its(device, itte, its)
-			if (itte->collection &&
-			    itte->collection->collection_id == coll_id)
-				itte->collection = NULL;
-
-		list_del(&collection->coll_list);
-		kfree(collection);
+		vgic_its_free_collection(its, coll_id);
 	} else {
+		collection = find_collection(its, coll_id);
+
 		if (!collection) {
-			collection = kzalloc(sizeof(struct its_collection),
-					     GFP_KERNEL);
-			if (!collection)
-				return -ENOMEM;
+			int ret;
 
-			vgic_its_init_collection(its, collection, coll_id);
+			ret = vgic_its_alloc_collection(its, &collection,
+							coll_id);
+			if (ret)
+				return ret;
 			collection->target_addr = target_addr;
 		} else {
 			collection->target_addr = target_addr;

From bb7176449f6da27534a0faf3a67997bf2c3172aa Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 21:35:07 +0100
Subject: [PATCH 282/302] KVM: arm64: vgic-its: Add pointer to corresponding
 kvm_device

Going from the ITS structure to the corresponding KVM structure
would be quite handy at times. The kvm_device pointer that is
passed at create time is quite convenient for this, so let's
keep a copy of it in the vgic_its structure.

This will be put to a good use in subsequent patches.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/kvm/arm_vgic.h       | 1 +
 virt/kvm/arm/vgic/vgic-its.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4e63a07b9001f5..540da5149ba7f0 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -138,6 +138,7 @@ struct vgic_its {
 	bool			enabled;
 	bool			initialized;
 	struct vgic_io_device	iodev;
+	struct kvm_device	*dev;
 
 	/* These registers correspond to GITS_BASER{0,1} */
 	u64			baser_device_table;
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index d6f68e9c946dd8..dcae567c522dc8 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -1368,6 +1368,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
 	dev->kvm->arch.vgic.has_its = true;
 	its->initialized = false;
 	its->enabled = false;
+	its->dev = dev;
 
 	its->baser_device_table = INITIAL_BASER_VALUE			|
 		((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);

From 6d03a68f8054430cba28e49d9e46c1cd4db39a70 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 21:52:55 +0100
Subject: [PATCH 283/302] KVM: arm64: vgic-its: Turn device_id validation into
 generic ID validation

There is no need to have separate functions to validate devices
and collections, as the architecture doesn't really distinguish the
two, and they are supposed to be managed the same way.

Let's turn the DevID checker into a generic one.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 134 ++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 72 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index dcae567c522dc8..996e3e19b53f7e 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -581,12 +581,73 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
 	return 0;
 }
 
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessbible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id)
+{
+	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+	int index;
+	u64 indirect_ptr;
+	gfn_t gfn;
+
+	if (!(baser & GITS_BASER_INDIRECT)) {
+		phys_addr_t addr;
+
+		if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser)))
+			return false;
+
+		addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser);
+		gfn = addr >> PAGE_SHIFT;
+
+		return kvm_is_visible_gfn(its->dev->kvm, gfn);
+	}
+
+	/* calculate and check the index into the 1st level */
+	index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+	if (index >= (l1_tbl_size / sizeof(u64)))
+		return false;
+
+	/* Each 1st level entry is represented by a 64-bit value. */
+	if (kvm_read_guest(its->dev->kvm,
+			   BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+			   &indirect_ptr, sizeof(indirect_ptr)))
+		return false;
+
+	indirect_ptr = le64_to_cpu(indirect_ptr);
+
+	/* check the valid bit of the first level entry */
+	if (!(indirect_ptr & BIT_ULL(63)))
+		return false;
+
+	/*
+	 * Mask the guest physical address and calculate the frame number.
+	 * Any address beyond our supported 48 bits of PA will be caught
+	 * by the actual check in the final step.
+	 */
+	indirect_ptr &= GENMASK_ULL(51, 16);
+
+	/* Find the address of the actual entry */
+	index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+	indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser);
+	gfn = indirect_ptr >> PAGE_SHIFT;
+
+	return kvm_is_visible_gfn(its->dev->kvm, gfn);
+}
+
 static int vgic_its_alloc_collection(struct vgic_its *its,
 				     struct its_collection **colp,
 				     u32 coll_id)
 {
 	struct its_collection *collection;
 
+	if (!vgic_its_check_id(its, its->baser_coll_table, coll_id))
+		return E_ITS_MAPC_COLLECTION_OOR;
+
 	collection = kzalloc(sizeof(*collection), GFP_KERNEL);
 
 	collection->collection_id = coll_id;
@@ -708,67 +769,6 @@ static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
 	kfree(device);
 }
 
-/*
- * Check whether a device ID can be stored into the guest device tables.
- * For a direct table this is pretty easy, but gets a bit nasty for
- * indirect tables. We check whether the resulting guest physical address
- * is actually valid (covered by a memslot and guest accessbible).
- * For this we have to read the respective first level entry.
- */
-static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its,
-				     int device_id)
-{
-	u64 r = its->baser_device_table;
-	int l1_tbl_size = GITS_BASER_NR_PAGES(r) * SZ_64K;
-	int index;
-	u64 indirect_ptr;
-	gfn_t gfn;
-
-
-	if (!(r & GITS_BASER_INDIRECT)) {
-		phys_addr_t addr;
-
-		if (device_id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r)))
-			return false;
-
-		addr = BASER_ADDRESS(r) + device_id * GITS_BASER_ENTRY_SIZE(r);
-		gfn = addr >> PAGE_SHIFT;
-
-		return kvm_is_visible_gfn(kvm, gfn);
-	}
-
-	/* calculate and check the index into the 1st level */
-	index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r));
-	if (index >= (l1_tbl_size / sizeof(u64)))
-		return false;
-
-	/* Each 1st level entry is represented by a 64-bit value. */
-	if (kvm_read_guest(kvm,
-			   BASER_ADDRESS(r) + index * sizeof(indirect_ptr),
-			   &indirect_ptr, sizeof(indirect_ptr)))
-		return false;
-
-	indirect_ptr = le64_to_cpu(indirect_ptr);
-
-	/* check the valid bit of the first level entry */
-	if (!(indirect_ptr & BIT_ULL(63)))
-		return false;
-
-	/*
-	 * Mask the guest physical address and calculate the frame number.
-	 * Any address beyond our supported 48 bits of PA will be caught
-	 * by the actual check in the final step.
-	 */
-	indirect_ptr &= GENMASK_ULL(51, 16);
-
-	/* Find the address of the actual entry */
-	index = device_id % (SZ_64K / GITS_BASER_ENTRY_SIZE(r));
-	indirect_ptr += index * GITS_BASER_ENTRY_SIZE(r);
-	gfn = indirect_ptr >> PAGE_SHIFT;
-
-	return kvm_is_visible_gfn(kvm, gfn);
-}
-
 /*
  * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
  * Must be called with the its_lock mutex held.
@@ -780,7 +780,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
 	bool valid = its_cmd_get_validbit(its_cmd);
 	struct its_device *device;
 
-	if (!vgic_its_check_device_id(kvm, its, device_id))
+	if (!vgic_its_check_id(its, its->baser_device_table, device_id))
 		return E_ITS_MAPD_DEVICE_OOR;
 
 	device = find_its_device(its, device_id);
@@ -812,13 +812,6 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
 	return 0;
 }
 
-static int vgic_its_nr_collection_ids(struct vgic_its *its)
-{
-	u64 r = its->baser_coll_table;
-
-	return (GITS_BASER_NR_PAGES(r) * SZ_64K) / GITS_BASER_ENTRY_SIZE(r);
-}
-
 /*
  * The MAPC command maps collection IDs to redistributors.
  * Must be called with the its_lock mutex held.
@@ -838,9 +831,6 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
 	if (target_addr >= atomic_read(&kvm->online_vcpus))
 		return E_ITS_MAPC_PROCNUM_OOR;
 
-	if (coll_id >= vgic_its_nr_collection_ids(its))
-		return E_ITS_MAPC_COLLECTION_OOR;
-
 	if (!valid) {
 		vgic_its_free_collection(its, coll_id);
 	} else {

From a3e7aa271eec50a674d33bb6eafa51c5f1e5f51f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sun, 17 Jul 2016 22:38:32 +0100
Subject: [PATCH 284/302] KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi
 similar to other handlers

vgic_its_cmd_handle_mapi has an extra "subcmd" argument, which is
already contained in the command buffer that all command handlers
obtain from the command queue. Let's drop it, as it is not that
useful.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 996e3e19b53f7e..ec7e07bc255902 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -688,7 +688,7 @@ static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
  * Must be called with its_lock mutex held.
  */
 static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd, u8 subcmd)
+				    u64 *its_cmd)
 {
 	u32 device_id = its_cmd_get_deviceid(its_cmd);
 	u32 event_id = its_cmd_get_id(its_cmd);
@@ -711,7 +711,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 		new_coll = collection;
 	}
 
-	if (subcmd == GITS_CMD_MAPTI)
+	if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
 		lpi_nr = its_cmd_get_physical_id(its_cmd);
 	else
 		lpi_nr = event_id;
@@ -999,11 +999,10 @@ static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
 static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
 				   u64 *its_cmd)
 {
-	u8 cmd = its_cmd_get_command(its_cmd);
 	int ret = -ENODEV;
 
 	mutex_lock(&its->its_lock);
-	switch (cmd) {
+	switch (its_cmd_get_command(its_cmd)) {
 	case GITS_CMD_MAPD:
 		ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
 		break;
@@ -1011,10 +1010,10 @@ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
 		ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
 		break;
 	case GITS_CMD_MAPI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd);
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
 		break;
 	case GITS_CMD_MAPTI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd);
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
 		break;
 	case GITS_CMD_MOVI:
 		ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);

From 3a88bded203591d4683aacdbb65cd0f549bc58cb Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 18 Jul 2016 16:27:14 +0100
Subject: [PATCH 285/302] KVM: arm64: vgic-its: Simplify MAPI error handling

If we move all the checks that do not involve any memory allocation
ahead of the allocations, we can simplify the MAPI error handling.
Let's do that, it cannot hurt.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 virt/kvm/arm/vgic/vgic-its.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index ec7e07bc255902..07411cf967b987 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -697,36 +697,34 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 	struct its_device *device;
 	struct its_collection *collection, *new_coll = NULL;
 	int lpi_nr;
-	int ret;
 
 	device = find_its_device(its, device_id);
 	if (!device)
 		return E_ITS_MAPTI_UNMAPPED_DEVICE;
 
-	collection = find_collection(its, coll_id);
-	if (!collection) {
-		ret = vgic_its_alloc_collection(its, &collection, coll_id);
-		if (ret)
-			return ret;
-		new_coll = collection;
-	}
-
 	if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
 		lpi_nr = its_cmd_get_physical_id(its_cmd);
 	else
 		lpi_nr = event_id;
 	if (lpi_nr < GIC_LPI_OFFSET ||
-	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) {
-		ret = E_ITS_MAPTI_PHYSICALID_OOR;
-		goto err;
+	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+		return E_ITS_MAPTI_PHYSICALID_OOR;
+
+	collection = find_collection(its, coll_id);
+	if (!collection) {
+		int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+		if (ret)
+			return ret;
+		new_coll = collection;
 	}
 
 	itte = find_itte(its, device_id, event_id);
 	if (!itte) {
 		itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
 		if (!itte) {
-			ret = -ENOMEM;
-			goto err;
+			if (new_coll)
+				vgic_its_free_collection(its, coll_id);
+			return -ENOMEM;
 		}
 
 		itte->event_id	= event_id;
@@ -746,10 +744,6 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
 	update_lpi_config(kvm, itte->irq, NULL);
 
 	return 0;
-err:
-	if (new_coll)
-		vgic_its_free_collection(its, coll_id);
-	return ret;
 }
 
 /* Requires the its_lock to be held. */

From f024ee098476a3e620232e4a78cfac505f121245 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 22 Jun 2016 14:21:59 +1000
Subject: [PATCH 286/302] KVM: PPC: Book3S HV: Pull out TM state save/restore
 into separate procedures

This moves the transactional memory state save and restore sequences
out of the guest entry/exit paths into separate procedures.  This is
so that these sequences can be used in going into and out of nap
in a subsequent patch.

The only code changes here are (a) saving and restoring LR on the
stack, since these new procedures get called with a bl instruction,
(b) explicitly saving r1 into the PACA instead of assuming that
HSTATE_HOST_R1(r13) is already set, and (c) removing an unnecessary
and redundant setting of MSR[TM] that should have been removed by
commit 9d4d0bdd9e0a ("KVM: PPC: Book3S HV: Add transactional memory
support", 2013-09-24) but wasn't.

Cc: stable@vger.kernel.org # v3.15+
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 449 +++++++++++++-----------
 1 file changed, 237 insertions(+), 212 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 0d246fca157a96..cfa4031f0806e0 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -689,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-	b	skip_tm
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-
-	/* Turn on TM/FP/VSX/VMX so we can restore them. */
-	mfmsr	r5
-	li	r6, MSR_TM >> 32
-	sldi	r6, r6, 32
-	or	r5, r5, r6
-	ori	r5, r5, MSR_FP
-	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
-	mtmsrd	r5
-
-	/*
-	 * The user may change these outside of a transaction, so they must
-	 * always be context switched.
-	 */
-	ld	r5, VCPU_TFHAR(r4)
-	ld	r6, VCPU_TFIAR(r4)
-	ld	r7, VCPU_TEXASR(r4)
-	mtspr	SPRN_TFHAR, r5
-	mtspr	SPRN_TFIAR, r6
-	mtspr	SPRN_TEXASR, r7
-
-	ld	r5, VCPU_MSR(r4)
-	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-	beq	skip_tm	/* TM not active in guest */
-
-	/* Make sure the failure summary is set, otherwise we'll program check
-	 * when we trechkpt.  It's possible that this might have been not set
-	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
-	 * host.
-	 */
-	oris	r7, r7, (TEXASR_FS)@h
-	mtspr	SPRN_TEXASR, r7
-
-	/*
-	 * We need to load up the checkpointed state for the guest.
-	 * We need to do this early as it will blow away any GPRs, VSRs and
-	 * some SPRs.
-	 */
-
-	mr	r31, r4
-	addi	r3, r31, VCPU_FPRS_TM
-	bl	load_fp_state
-	addi	r3, r31, VCPU_VRS_TM
-	bl	load_vr_state
-	mr	r4, r31
-	lwz	r7, VCPU_VRSAVE_TM(r4)
-	mtspr	SPRN_VRSAVE, r7
-
-	ld	r5, VCPU_LR_TM(r4)
-	lwz	r6, VCPU_CR_TM(r4)
-	ld	r7, VCPU_CTR_TM(r4)
-	ld	r8, VCPU_AMR_TM(r4)
-	ld	r9, VCPU_TAR_TM(r4)
-	mtlr	r5
-	mtcr	r6
-	mtctr	r7
-	mtspr	SPRN_AMR, r8
-	mtspr	SPRN_TAR, r9
-
-	/*
-	 * Load up PPR and DSCR values but don't put them in the actual SPRs
-	 * till the last moment to avoid running with userspace PPR and DSCR for
-	 * too long.
-	 */
-	ld	r29, VCPU_DSCR_TM(r4)
-	ld	r30, VCPU_PPR_TM(r4)
-
-	std	r2, PACATMSCRATCH(r13) /* Save TOC */
-
-	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
-	li	r5, 0
-	mtmsrd	r5, 1
-
-	/* Load GPRs r0-r28 */
-	reg = 0
-	.rept	29
-	ld	reg, VCPU_GPRS_TM(reg)(r31)
-	reg = reg + 1
-	.endr
-
-	mtspr	SPRN_DSCR, r29
-	mtspr	SPRN_PPR, r30
-
-	/* Load final GPRs */
-	ld	29, VCPU_GPRS_TM(29)(r31)
-	ld	30, VCPU_GPRS_TM(30)(r31)
-	ld	31, VCPU_GPRS_TM(31)(r31)
-
-	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
-	TRECHKPT
-
-	/* Now let's get back the state we need. */
-	HMT_MEDIUM
-	GET_PACA(r13)
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-	ld	r4, HSTATE_KVM_VCPU(r13)
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATMSCRATCH(r13)
-
-	/* Set the MSR RI since we have our registers back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-skip_tm:
+	bl	kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 	/* Load guest PMU registers */
@@ -875,12 +771,6 @@ BEGIN_FTR_SECTION
 	/* Skip next section on POWER7 */
 	b	8f
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
-	mfmsr	r8
-	li	r0, 1
-	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-	mtmsrd	r8
-
 	/* Load up POWER8-specific registers */
 	ld	r5, VCPU_IAMR(r4)
 	lwz	r6, VCPU_PSPB(r4)
@@ -1470,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 BEGIN_FTR_SECTION
-	b	2f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-	/* Turn on TM. */
-	mfmsr	r8
-	li	r0, 1
-	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-	mtmsrd	r8
-
-	ld	r5, VCPU_MSR(r9)
-	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-	beq	1f	/* TM not active in guest. */
-
-	li	r3, TM_CAUSE_KVM_RESCHED
-
-	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
-	li	r5, 0
-	mtmsrd	r5, 1
-
-	/* All GPRs are volatile at this point. */
-	TRECLAIM(R3)
-
-	/* Temporarily store r13 and r9 so we have some regs to play with */
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9, PACATMSCRATCH(r13)
-	ld	r9, HSTATE_KVM_VCPU(r13)
-
-	/* Get a few more GPRs free. */
-	std	r29, VCPU_GPRS_TM(29)(r9)
-	std	r30, VCPU_GPRS_TM(30)(r9)
-	std	r31, VCPU_GPRS_TM(31)(r9)
-
-	/* Save away PPR and DSCR soon so don't run with user values. */
-	mfspr	r31, SPRN_PPR
-	HMT_MEDIUM
-	mfspr	r30, SPRN_DSCR
-	ld	r29, HSTATE_DSCR(r13)
-	mtspr	SPRN_DSCR, r29
-
-	/* Save all but r9, r13 & r29-r31 */
-	reg = 0
-	.rept	29
-	.if (reg != 9) && (reg != 13)
-	std	reg, VCPU_GPRS_TM(reg)(r9)
-	.endif
-	reg = reg + 1
-	.endr
-	/* ... now save r13 */
-	GET_SCRATCH0(r4)
-	std	r4, VCPU_GPRS_TM(13)(r9)
-	/* ... and save r9 */
-	ld	r4, PACATMSCRATCH(r13)
-	std	r4, VCPU_GPRS_TM(9)(r9)
-
-	/* Reload stack pointer and TOC. */
-	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATOC(r13)
-
-	/* Set MSR RI now we have r1 and r13 back. */
-	li	r5, MSR_RI
-	mtmsrd	r5, 1
-
-	/* Save away checkpinted SPRs. */
-	std	r31, VCPU_PPR_TM(r9)
-	std	r30, VCPU_DSCR_TM(r9)
-	mflr	r5
-	mfcr	r6
-	mfctr	r7
-	mfspr	r8, SPRN_AMR
-	mfspr	r10, SPRN_TAR
-	std	r5, VCPU_LR_TM(r9)
-	stw	r6, VCPU_CR_TM(r9)
-	std	r7, VCPU_CTR_TM(r9)
-	std	r8, VCPU_AMR_TM(r9)
-	std	r10, VCPU_TAR_TM(r9)
-
-	/* Restore r12 as trap number. */
-	lwz	r12, VCPU_TRAP(r9)
-
-	/* Save FP/VSX. */
-	addi	r3, r9, VCPU_FPRS_TM
-	bl	store_fp_state
-	addi	r3, r9, VCPU_VRS_TM
-	bl	store_vr_state
-	mfspr	r6, SPRN_VRSAVE
-	stw	r6, VCPU_VRSAVE_TM(r9)
-1:
-	/*
-	 * We need to save these SPRs after the treclaim so that the software
-	 * error code is recorded correctly in the TEXASR.  Also the user may
-	 * change these outside of a transaction, so they must always be
-	 * context switched.
-	 */
-	mfspr	r5, SPRN_TFHAR
-	mfspr	r6, SPRN_TFIAR
-	mfspr	r7, SPRN_TEXASR
-	std	r5, VCPU_TFHAR(r9)
-	std	r6, VCPU_TFIAR(r9)
-	std	r7, VCPU_TEXASR(r9)
-2:
+	bl	kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 	/* Increment yield count if they have a VPA */
@@ -2694,6 +2486,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	mr	r4,r31
 	blr
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+kvmppc_save_tm:
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+
+	/* Turn on TM. */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	ld	r5, VCPU_MSR(r9)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beq	1f	/* TM not active in guest. */
+
+	std	r1, HSTATE_HOST_R1(r13)
+	li	r3, TM_CAUSE_KVM_RESCHED
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* All GPRs are volatile at this point. */
+	TRECLAIM(R3)
+
+	/* Temporarily store r13 and r9 so we have some regs to play with */
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9, PACATMSCRATCH(r13)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Get a few more GPRs free. */
+	std	r29, VCPU_GPRS_TM(29)(r9)
+	std	r30, VCPU_GPRS_TM(30)(r9)
+	std	r31, VCPU_GPRS_TM(31)(r9)
+
+	/* Save away PPR and DSCR soon so don't run with user values. */
+	mfspr	r31, SPRN_PPR
+	HMT_MEDIUM
+	mfspr	r30, SPRN_DSCR
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+
+	/* Save all but r9, r13 & r29-r31 */
+	reg = 0
+	.rept	29
+	.if (reg != 9) && (reg != 13)
+	std	reg, VCPU_GPRS_TM(reg)(r9)
+	.endif
+	reg = reg + 1
+	.endr
+	/* ... now save r13 */
+	GET_SCRATCH0(r4)
+	std	r4, VCPU_GPRS_TM(13)(r9)
+	/* ... and save r9 */
+	ld	r4, PACATMSCRATCH(r13)
+	std	r4, VCPU_GPRS_TM(9)(r9)
+
+	/* Reload stack pointer and TOC. */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	/* Set MSR RI now we have r1 and r13 back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+
+	/* Save away checkpinted SPRs. */
+	std	r31, VCPU_PPR_TM(r9)
+	std	r30, VCPU_DSCR_TM(r9)
+	mflr	r5
+	mfcr	r6
+	mfctr	r7
+	mfspr	r8, SPRN_AMR
+	mfspr	r10, SPRN_TAR
+	std	r5, VCPU_LR_TM(r9)
+	stw	r6, VCPU_CR_TM(r9)
+	std	r7, VCPU_CTR_TM(r9)
+	std	r8, VCPU_AMR_TM(r9)
+	std	r10, VCPU_TAR_TM(r9)
+
+	/* Restore r12 as trap number. */
+	lwz	r12, VCPU_TRAP(r9)
+
+	/* Save FP/VSX. */
+	addi	r3, r9, VCPU_FPRS_TM
+	bl	store_fp_state
+	addi	r3, r9, VCPU_VRS_TM
+	bl	store_vr_state
+	mfspr	r6, SPRN_VRSAVE
+	stw	r6, VCPU_VRSAVE_TM(r9)
+1:
+	/*
+	 * We need to save these SPRs after the treclaim so that the software
+	 * error code is recorded correctly in the TEXASR.  Also the user may
+	 * change these outside of a transaction, so they must always be
+	 * context switched.
+	 */
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	mfspr	r7, SPRN_TEXASR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+	std	r7, VCPU_TEXASR(r9)
+
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+kvmppc_restore_tm:
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+
+	/* Turn on TM/FP/VSX/VMX so we can restore them. */
+	mfmsr	r5
+	li	r6, MSR_TM >> 32
+	sldi	r6, r6, 32
+	or	r5, r5, r6
+	ori	r5, r5, MSR_FP
+	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
+	mtmsrd	r5
+
+	/*
+	 * The user may change these outside of a transaction, so they must
+	 * always be context switched.
+	 */
+	ld	r5, VCPU_TFHAR(r4)
+	ld	r6, VCPU_TFIAR(r4)
+	ld	r7, VCPU_TEXASR(r4)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+
+	ld	r5, VCPU_MSR(r4)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beqlr		/* TM not active in guest */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	/* Make sure the failure summary is set, otherwise we'll program check
+	 * when we trechkpt.  It's possible that this might have been not set
+	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+	 * host.
+	 */
+	oris	r7, r7, (TEXASR_FS)@h
+	mtspr	SPRN_TEXASR, r7
+
+	/*
+	 * We need to load up the checkpointed state for the guest.
+	 * We need to do this early as it will blow away any GPRs, VSRs and
+	 * some SPRs.
+	 */
+
+	mr	r31, r4
+	addi	r3, r31, VCPU_FPRS_TM
+	bl	load_fp_state
+	addi	r3, r31, VCPU_VRS_TM
+	bl	load_vr_state
+	mr	r4, r31
+	lwz	r7, VCPU_VRSAVE_TM(r4)
+	mtspr	SPRN_VRSAVE, r7
+
+	ld	r5, VCPU_LR_TM(r4)
+	lwz	r6, VCPU_CR_TM(r4)
+	ld	r7, VCPU_CTR_TM(r4)
+	ld	r8, VCPU_AMR_TM(r4)
+	ld	r9, VCPU_TAR_TM(r4)
+	mtlr	r5
+	mtcr	r6
+	mtctr	r7
+	mtspr	SPRN_AMR, r8
+	mtspr	SPRN_TAR, r9
+
+	/*
+	 * Load up PPR and DSCR values but don't put them in the actual SPRs
+	 * till the last moment to avoid running with userspace PPR and DSCR for
+	 * too long.
+	 */
+	ld	r29, VCPU_DSCR_TM(r4)
+	ld	r30, VCPU_PPR_TM(r4)
+
+	std	r2, PACATMSCRATCH(r13) /* Save TOC */
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* Load GPRs r0-r28 */
+	reg = 0
+	.rept	29
+	ld	reg, VCPU_GPRS_TM(reg)(r31)
+	reg = reg + 1
+	.endr
+
+	mtspr	SPRN_DSCR, r29
+	mtspr	SPRN_PPR, r30
+
+	/* Load final GPRs */
+	ld	29, VCPU_GPRS_TM(29)(r31)
+	ld	30, VCPU_GPRS_TM(30)(r31)
+	ld	31, VCPU_GPRS_TM(31)(r31)
+
+	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
+	TRECHKPT
+
+	/* Now let's get back the state we need. */
+	HMT_MEDIUM
+	GET_PACA(r13)
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATMSCRATCH(r13)
+
+	/* Set the MSR RI since we have our registers back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+#endif
+
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.

From 93d17397e4e2182fdaad503e2f9da46202c0f1c3 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 22 Jun 2016 15:52:55 +1000
Subject: [PATCH 287/302] KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE

It turns out that if the guest does a H_CEDE while the CPU is in
a transactional state, and the H_CEDE does a nap, and the nap
loses the architected state of the CPU (which it is allowed to do),
then we lose the checkpointed state of the virtual CPU.  In addition,
the transactional-memory state recorded in the MSR gets reset back
to non-transactional, and when we try to return to the guest, we take
a TM bad thing type of program interrupt because we are trying to
transition from non-transactional to transactional with a hrfid
instruction, which is not permitted.

The result of the program interrupt occurring at that point is that
the host CPU will hang in an infinite loop with interrupts disabled.
Thus this is a denial of service vulnerability in the host which can
be triggered by any guest (and depending on the guest kernel, it can
potentially be triggered by unprivileged userspace in the guest).

This vulnerability has been assigned the ID CVE-2016-5412.

To fix this, we save the TM state before napping and restore it
on exit from the nap, when handling a H_CEDE in real mode.  The
case where H_CEDE exits to host virtual mode is already OK (as are
other hcalls which exit to host virtual mode) because the exit
path saves the TM state.

Cc: stable@vger.kernel.org # v3.15+
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index cfa4031f0806e0..543124fa11d94b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -2093,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
 	/* save FP state */
 	bl	kvmppc_save_fp
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	bl	kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
 	/*
 	 * Set DEC to the smaller of DEC and HDEC, so that we wake
 	 * no later than the end of our timeslice (HDEC interrupts
@@ -2169,6 +2176,12 @@ kvm_end_cede:
 	bl	kvmhv_accumulate_time
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	bl	kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
 	/* load up FP state */
 	bl	kvmppc_load_fp
 

From 4f2777bc97974b0df9276ee9a85155a9e27a5282 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Wed, 13 Jul 2016 17:16:37 -0700
Subject: [PATCH 288/302] kvm: x86: nVMX: maintain internal copy of current
 VMCS

KVM maintains L1's current VMCS in guest memory, at the guest physical
page identified by the argument to VMPTRLD. This makes hairy
time-of-check to time-of-use bugs possible, as VCPUs can be writing
to the VMCS page in memory while KVM is emulating VMLAUNCH and
VMRESUME.

The spec documents that writing to the VMCS page while it is loaded is
"undefined". Therefore it is reasonable to load the entire VMCS into
an internal cache during VMPTRLD and ignore writes to the VMCS page
-- the guest should be using VMREAD and VMWRITE to access the current
VMCS.

To adhere to the spec, KVM should flush the current VMCS during VMPTRLD,
and the target VMCS during VMCLEAR (as given by the operand to VMCLEAR).
Since this implementation of VMCS caching only maintains the current
VMCS, VMCLEAR will only do a flush if the operand to VMCLEAR is the
current VMCS pointer.

KVM will also flush during VMXOFF, which is not mandated by the spec,
but also not in conflict with the spec.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b61cdadf8623d6..151d2619238c90 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -405,6 +405,12 @@ struct nested_vmx {
 	/* The host-usable pointer to the above */
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
+	/*
+	 * Cache of the guest's VMCS, existing outside of guest memory.
+	 * Loaded from guest memory during VMPTRLD. Flushed to guest
+	 * memory during VMXOFF, VMCLEAR, VMPTRLD.
+	 */
+	struct vmcs12 *cached_vmcs12;
 	struct vmcs *current_shadow_vmcs;
 	/*
 	 * Indicates if the shadow vmcs must be updated with the
@@ -858,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {
-	return to_vmx(vcpu)->nested.current_vmcs12;
+	return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -6987,10 +6993,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+	if (!vmx->nested.cached_vmcs12)
+		return -ENOMEM;
+
 	if (enable_shadow_vmcs) {
 		shadow_vmcs = alloc_vmcs();
-		if (!shadow_vmcs)
+		if (!shadow_vmcs) {
+			kfree(vmx->nested.cached_vmcs12);
 			return -ENOMEM;
+		}
 		/* mark vmcs as shadow */
 		shadow_vmcs->revision_id |= (1u << 31);
 		/* init shadow vmcs */
@@ -7061,6 +7073,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 		vmcs_write64(VMCS_LINK_POINTER, -1ull);
 	}
 	vmx->nested.posted_intr_nv = -1;
+
+	/* Flush VMCS12 to guest memory */
+	memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+	       VMCS12_SIZE);
+
 	kunmap(vmx->nested.current_vmcs12_page);
 	nested_release_page(vmx->nested.current_vmcs12_page);
 	vmx->nested.current_vmptr = -1ull;
@@ -7081,6 +7098,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 	nested_release_vmcs12(vmx);
 	if (enable_shadow_vmcs)
 		free_vmcs(vmx->nested.current_shadow_vmcs);
+	kfree(vmx->nested.cached_vmcs12);
 	/* Unpin physical memory we referred to in current vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		nested_release_page(vmx->nested.apic_access_page);
@@ -7484,6 +7502,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		vmx->nested.current_vmptr = vmptr;
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
+		/*
+		 * Load VMCS12 from guest memory since it is not already
+		 * cached.
+		 */
+		memcpy(vmx->nested.cached_vmcs12,
+		       vmx->nested.current_vmcs12, VMCS12_SIZE);
+
 		if (enable_shadow_vmcs) {
 			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 				      SECONDARY_EXEC_SHADOW_VMCS);
@@ -8456,7 +8481,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 	 * the next L2->L1 exit.
 	 */
 	if (!is_guest_mode(vcpu) ||
-	    !nested_cpu_has2(vmx->nested.current_vmcs12,
+	    !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
 			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 		vmcs_write64(APIC_ACCESS_ADDR, hpa);
 }

From b80c76ec982c00f2a15668ed71c1d705b6ff95fd Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 29 Jul 2016 18:56:53 -0700
Subject: [PATCH 289/302] KVM: VMX: Add VMCS to CPU's loaded VMCSs before
 VMPTRLD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kexec needs to know the addresses of all VMCSs that are active on
each CPU, so that it can flush them from the VMCS caches. It is
safe to record superfluous addresses that are not associated with
an active VMCS, but it is not safe to omit an address associated
with an active VMCS.

After a call to vmcs_load, the VMCS that was loaded is active on
the CPU. The VMCS should be added to the CPU's list of active
VMCSs before it is loaded.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 151d2619238c90..b2f559159f3a4e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2205,22 +2205,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
 	if (!vmm_exclusive)
 		kvm_cpu_vmxon(phys_addr);
-	else if (vmx->loaded_vmcs->cpu != cpu)
+	else if (!already_loaded)
 		loaded_vmcs_clear(vmx->loaded_vmcs);
 
-	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-		vmcs_load(vmx->loaded_vmcs->vmcs);
-	}
-
-	if (vmx->loaded_vmcs->cpu != cpu) {
-		struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-		unsigned long sysenter_esp;
-
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	if (!already_loaded) {
 		local_irq_disable();
 		crash_disable_local_vmclear(cpu);
 
@@ -2235,6 +2227,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 		crash_enable_local_vmclear(cpu);
 		local_irq_enable();
+	}
+
+	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+		vmcs_load(vmx->loaded_vmcs->vmcs);
+	}
+
+	if (!already_loaded) {
+		struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+		unsigned long sysenter_esp;
+
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
 		/*
 		 * Linux uses per-cpu TSS and GDT, so set these when switching

From 6002bdd3e6688954f5f5c1d71b83862cfd7387d9 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:20 +0100
Subject: [PATCH 290/302] MIPS: Fix definition of KSEGX() for 64-bit

The KSEGX() macro is defined to 32-bit sign extend the address argument
and logically AND the result with 0xe0000000, with the final result
usually compared against one of the CKSEG macros. However the literal
0xe0000000 is unsigned as the high bit is set, and is therefore
zero-extended on 64-bit kernels, resulting in the sign extension bits of
the argument being masked to zero. This results in the odd situation
where:

  KSEGX(CKSEG0) != CKSEG0
  (0xffffffff80000000 & 0x00000000e0000000) != 0xffffffff80000000

Fix this by 32-bit sign extending the 0xe0000000 literal using
_ACAST32_.

This will help some MIPS KVM code handling 32-bit guest addresses to
work on 64-bit host kernels, but will also affect KSEGX in
dec_kn01_be_backend() on a 64-bit DECstation kernel, and the SiByte DMA
page ops KSEGX check in clear_page() and copy_page() on 64-bit SB1
kernels, neither of which appear to be designed with 64-bit segments in
mind anyway.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cc: linux-mips@linux-mips.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/addrspace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/addrspace.h b/arch/mips/include/asm/addrspace.h
index 3b0e51d5a613f7..c5b04e752e9762 100644
--- a/arch/mips/include/asm/addrspace.h
+++ b/arch/mips/include/asm/addrspace.h
@@ -45,7 +45,7 @@
 /*
  * Returns the kernel segment base of a given address
  */
-#define KSEGX(a)		((_ACAST32_ (a)) & 0xe0000000)
+#define KSEGX(a)		((_ACAST32_(a)) & _ACAST32_(0xe0000000))
 
 /*
  * Returns the physical address of a CKSEGx / XKPHYS address

From cfacaced0cce20859de25b61d672edeb9789a1e9 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:21 +0100
Subject: [PATCH 291/302] MIPS: KVM: Use virt_to_phys() to get commpage PFN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Calculate the PFN of the commpage using virt_to_phys() instead of
CPHYSADDR(). This is more portable as kzalloc() may allocate from XKPhys
instead of KSeg0 on 64-bit kernels, which CPHYSADDR() doesn't handle.
This is sufficient for highmem kernels too since kzalloc() will allocate
from lowmem in KSeg0.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 9699352293e498..f5f8c2acae53a3 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -176,7 +176,7 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	unsigned long entrylo[2] = { 0, 0 };
 	unsigned int pair_idx;
 
-	pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
+	pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
 	pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
 	entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
 		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |

From 28cc5bd568745a58bb06291ac336d06b66c66dff Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:22 +0100
Subject: [PATCH 292/302] MIPS: KVM: Use kmap instead of CKSEG0ADDR()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are several unportable uses of CKSEG0ADDR() in MIPS KVM, which
implicitly assume that a host physical address will be in the low 512MB
of the physical address space (accessible in KSeg0). These assumptions
don't hold for highmem or on 64-bit kernels.

When interpreting the guest physical address when reading or overwriting
a trapping instruction, use kmap_atomic() to get a usable virtual
address to access guest memory, which is portable to 64-bit and highmem
kernels.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c | 17 +++++++++++------
 arch/mips/kvm/mmu.c      |  7 ++++++-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 91ebd2b6034f67..9a16ba2cb48792 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -11,6 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/highmem.h>
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
@@ -29,14 +30,18 @@
 static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
 				  union mips_instruction replace)
 {
-	unsigned long kseg0_opc, flags;
+	unsigned long paddr, flags;
+	void *vaddr;
 
 	if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
-		kseg0_opc =
-		    CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
-			       (vcpu, (unsigned long) opc));
-		memcpy((void *)kseg0_opc, (void *)&replace, sizeof(u32));
-		local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
+		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+							    (unsigned long)opc);
+		vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+		vaddr += paddr & ~PAGE_MASK;
+		memcpy(vaddr, (void *)&replace, sizeof(u32));
+		local_flush_icache_range((unsigned long)vaddr,
+					 (unsigned long)vaddr + 32);
+		kunmap_atomic(vaddr);
 	} else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
 		local_irq_save(flags);
 		memcpy((void *)opc, (void *)&replace, sizeof(u32));
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index ecead748de049f..57319ee57c4fdd 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -9,6 +9,7 @@
  * Authors: Sanjay Lal <sanjayl@kymasys.com>
  */
 
+#include <linux/highmem.h>
 #include <linux/kvm_host.h>
 #include <asm/mmu_context.h>
 
@@ -330,6 +331,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	unsigned long paddr, flags, vpn2, asid;
 	unsigned long va = (unsigned long)opc;
+	void *vaddr;
 	u32 inst;
 	int index;
 
@@ -360,7 +362,10 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 		local_irq_restore(flags);
 	} else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
 		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
-		inst = *(u32 *) CKSEG0ADDR(paddr);
+		vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+		vaddr += paddr & ~PAGE_MASK;
+		inst = *(u32 *)vaddr;
+		kunmap_atomic(vaddr);
 	} else {
 		kvm_err("%s: illegal address: %p\n", __func__, opc);
 		return KVM_INVALID_INST;

From e41637d85846b5b4b6ef5232a22b7e74c03f1be6 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:23 +0100
Subject: [PATCH 293/302] MIPS: KVM: Make entry code MIPS64 friendly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MIPS KVM entry code (originally kvm_locore.S, later locore.S, and
now entry.c) has never quite been right when built for 64-bit, using
32-bit instructions when 64-bit instructions were needed for handling
64-bit registers and pointers. Fix several cases of this now.

The changes roughly fall into the following categories.

- COP0 scratch registers contain guest register values and the VCPU
  pointer, and are themselves full width. Similarly CP0_EPC and
  CP0_BadVAddr registers are full width (even though technically we
  don't support 64-bit guest address spaces with trap & emulate KVM).
  Use UASM_i_MFC0/UASM_i_MTC0 (instead of uasm_i_mfc0/uasm_i_mtc0) for
  accessing them.

- Handling of stack pointers and the VCPU pointer must match the pointer
  size of the kernel ABI (always o32 or n64), so use UASM_i_ADDIU (instead
  of uasm_i_addiu).

- The CPU number in thread_info, and the guest_{user,kernel}_asid arrays
  in kvm_vcpu_arch are all 32 bit integers, so use uasm_i_lw (instead of
  UASM_i_LW) to load them.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 48 +++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index 75ba7c2ecb3d00..f4556d0279c6fb 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -120,12 +120,12 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
 					unsigned int frame)
 {
 	/* Save the VCPU scratch register value in cp0_epc of the stack frame */
-	uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
 	UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
 
 	/* Save the temp scratch register value in cp0_cause of stack frame */
 	if (scratch_tmp[0] == 31) {
-		uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+		UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
 		UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
 	}
 }
@@ -138,11 +138,11 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
 	 * kvm_mips_build_save_scratch().
 	 */
 	UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
-	uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
 
 	if (scratch_tmp[0] == 31) {
 		UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
-		uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+		UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
 	}
 }
 
@@ -171,7 +171,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
 	 */
 
 	/* k0/k1 not being used in host kernel context */
-	uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs));
+	UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs));
 	for (i = 16; i < 32; ++i) {
 		if (i == 24)
 			i = 28;
@@ -186,10 +186,10 @@ void *kvm_mips_build_vcpu_run(void *addr)
 	kvm_mips_build_save_scratch(&p, V1, K1);
 
 	/* VCPU scratch register has pointer to vcpu */
-	uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
 
 	/* Offset into vcpu->arch */
-	uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+	UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
 
 	/*
 	 * Save the host stack to VCPU, used for exception processing
@@ -252,7 +252,7 @@ static void *kvm_mips_build_enter_guest(void *addr)
 
 	/* Set Guest EPC */
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
-	uasm_i_mtc0(&p, T0, C0_EPC);
+	UASM_i_MTC0(&p, T0, C0_EPC);
 
 	/* Set the ASID for the Guest Kernel */
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
@@ -261,20 +261,20 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
 	uasm_i_xori(&p, T0, T0, KSU_USER);
 	uasm_il_bnez(&p, &r, T0, label_kernel_asid);
-	 uasm_i_addiu(&p, T1, K1,
+	 UASM_i_ADDIU(&p, T1, K1,
 		      offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
 	/* else user */
-	uasm_i_addiu(&p, T1, K1,
+	UASM_i_ADDIU(&p, T1, K1,
 		     offsetof(struct kvm_vcpu_arch, guest_user_asid));
 	uasm_l_kernel_asid(&l, p);
 
 	/* t1: contains the base of the ASID array, need to get the cpu id  */
 	/* smp_processor_id */
-	UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP);
+	uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
 	/* x4 */
 	uasm_i_sll(&p, T2, T2, 2);
 	UASM_i_ADDU(&p, T3, T1, T2);
-	UASM_i_LW(&p, K0, 0, T3);
+	uasm_i_lw(&p, K0, 0, T3);
 #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
 	/* x sizeof(struct cpuinfo_mips)/4 */
 	uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
@@ -344,11 +344,11 @@ void *kvm_mips_build_exception(void *addr, void *handler)
 	memset(relocs, 0, sizeof(relocs));
 
 	/* Save guest k1 into scratch register */
-	uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+	UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
 
 	/* Get the VCPU pointer from the VCPU scratch register */
-	uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
-	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+	UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
 
 	/* Save guest k0 into VCPU structure */
 	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
@@ -415,13 +415,13 @@ void *kvm_mips_build_exit(void *addr)
 
 	/* Finally save guest k1 to VCPU */
 	uasm_i_ehb(&p);
-	uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
+	UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
 	UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
 
 	/* Now that context has been saved, we can use other registers */
 
 	/* Restore vcpu */
-	uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
 	uasm_i_move(&p, S1, A1);
 
 	/* Restore run (vcpu->run) */
@@ -433,10 +433,10 @@ void *kvm_mips_build_exit(void *addr)
 	 * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
 	 * the exception
 	 */
-	uasm_i_mfc0(&p, K0, C0_EPC);
+	UASM_i_MFC0(&p, K0, C0_EPC);
 	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
 
-	uasm_i_mfc0(&p, K0, C0_BADVADDR);
+	UASM_i_MFC0(&p, K0, C0_BADVADDR);
 	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
 		  K1);
 
@@ -506,7 +506,7 @@ void *kvm_mips_build_exit(void *addr)
 	UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
 
 	/* Saved host state */
-	uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs));
+	UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs));
 
 	/*
 	 * XXXKYMA do we need to load the host ASID, maybe not because the
@@ -529,7 +529,7 @@ void *kvm_mips_build_exit(void *addr)
 	 */
 	UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
 	uasm_i_jalr(&p, RA, T9);
-	 uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ);
+	 UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
 
 	uasm_resolve_relocs(relocs, labels);
 
@@ -569,7 +569,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr)
 	 */
 
 	uasm_i_move(&p, K1, S1);
-	uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+	UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
 
 	/*
 	 * Check return value, should tell us if we are returning to the
@@ -603,7 +603,7 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
 	u32 *p = addr;
 
 	/* Put the saved pointer to vcpu (s1) back into the scratch register */
-	uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+	UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
 
 	/* Load up the Guest EBASE to minimize the window where BEV is set */
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
@@ -645,7 +645,7 @@ static void *kvm_mips_build_ret_to_host(void *addr)
 
 	/* EBASE is already pointing to Linux */
 	UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
-	uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs));
+	UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs));
 
 	/*
 	 * r2/v0 is the return code, shift it down by 2 (arithmetic)

From 1d756942533b2330d8929dd0ea61a81a5d020196 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:24 +0100
Subject: [PATCH 294/302] MIPS: KVM: Set CP0_Status.KX on MIPS64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the KVM entry code to set the CP0_Status.KX bit on 64-bit kernels.
This is important to allow the entry code, running in kernel mode, to
access the full 64-bit address space right up to the point of entering
the guest, and immediately after exiting the guest, so it can safely
restore & save the guest context from 64-bit segments.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index f4556d0279c6fb..c824bfc4daa0f6 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -61,6 +61,12 @@
 
 #define CALLFRAME_SIZ   32
 
+#ifdef CONFIG_64BIT
+#define ST0_KX_IF_64	ST0_KX
+#else
+#define ST0_KX_IF_64	0
+#endif
+
 static unsigned int scratch_vcpu[2] = { C0_DDATA_LO };
 static unsigned int scratch_tmp[2] = { C0_ERROREPC };
 
@@ -204,7 +210,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
 	 * Setup status register for running the guest in UM, interrupts
 	 * are disabled
 	 */
-	UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV);
+	UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64);
 	uasm_i_mtc0(&p, K0, C0_STATUS);
 	uasm_i_ehb(&p);
 
@@ -217,7 +223,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
 	 * interrupt mask as it was but make sure that timer interrupts
 	 * are enabled
 	 */
-	uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE);
+	uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64);
 	uasm_i_andi(&p, V0, V0, ST0_IM);
 	uasm_i_or(&p, K0, K0, V0);
 	uasm_i_mtc0(&p, K0, C0_STATUS);

From 0d17aea5c27d7d748b1d8116d275b2b17dc5cad6 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:25 +0100
Subject: [PATCH 295/302] MIPS: KVM: Use 64-bit CP0_EBase when appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the KVM entry point to write CP0_EBase as a 64-bit register when
it is 64 bits wide, and to set the WG (write gate) bit if it exists in
order to write bits 63:30 (or 31:30 on MIPS32).

Prior to MIPS64r6 it was UNDEFINED to perform a 64-bit read or write of
a 32-bit COP0 register. Since this is dynamically generated code,
generate the right type of access depending on whether the kernel is
64-bit and cpu_has_ebase_wg.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/entry.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index c824bfc4daa0f6..6a02b3a3fa6513 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -152,6 +152,25 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
 	}
 }
 
+/**
+ * build_set_exc_base() - Assemble code to write exception base address.
+ * @p:		Code buffer pointer.
+ * @reg:	Source register (generated code may set WG bit in @reg).
+ *
+ * Assemble code to modify the exception base address in the EBase register,
+ * using the appropriately sized access and setting the WG bit if necessary.
+ */
+static inline void build_set_exc_base(u32 **p, unsigned int reg)
+{
+	if (cpu_has_ebase_wg) {
+		/* Set WG so that all the bits get written */
+		uasm_i_ori(p, reg, reg, MIPS_EBASE_WG);
+		UASM_i_MTC0(p, reg, C0_EBASE);
+	} else {
+		uasm_i_mtc0(p, reg, C0_EBASE);
+	}
+}
+
 /**
  * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
  * @addr:	Address to start writing code.
@@ -216,7 +235,7 @@ void *kvm_mips_build_vcpu_run(void *addr)
 
 	/* load up the new EBASE */
 	UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
-	uasm_i_mtc0(&p, K0, C0_EBASE);
+	build_set_exc_base(&p, K0);
 
 	/*
 	 * Now that the new EBASE has been loaded, unset BEV, set
@@ -463,7 +482,7 @@ void *kvm_mips_build_exit(void *addr)
 
 	UASM_i_LA_mostly(&p, K0, (long)&ebase);
 	UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
-	uasm_i_mtc0(&p, K0, C0_EBASE);
+	build_set_exc_base(&p, K0);
 
 	if (raw_cpu_has_fpu) {
 		/*
@@ -620,7 +639,7 @@ static void *kvm_mips_build_ret_to_guest(void *addr)
 	uasm_i_or(&p, K0, V1, AT);
 	uasm_i_mtc0(&p, K0, C0_STATUS);
 	uasm_i_ehb(&p);
-	uasm_i_mtc0(&p, T0, C0_EBASE);
+	build_set_exc_base(&p, T0);
 
 	/* Setup status register for running guest in UM */
 	uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);

From 2a06dab877dee3d4144c3ba32c662db18a1fdd2b Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:26 +0100
Subject: [PATCH 296/302] MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fail if the address of the allocated exception base doesn't fit into the
CP0_EBase register. This can happen on MIPS64 if CP0_EBase.WG isn't
implemented but RAM is available outside of the range of KSeg0.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 414b00074e296a..a6ea084b4d9d5f 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -300,6 +300,18 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
 		  ALIGN(size, PAGE_SIZE), gebase);
 
+	/*
+	 * Check new ebase actually fits in CP0_EBase. The lack of a write gate
+	 * limits us to the low 512MB of physical address space. If the memory
+	 * we allocate is out of range, just give up now.
+	 */
+	if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
+		kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+			gebase);
+		err = -ENOMEM;
+		goto out_free_gebase;
+	}
+
 	/* Save new ebase */
 	vcpu->arch.guest_ebase = gebase;
 

From 5808844f03b4b31a13a87cf41cc0701718c1b622 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:27 +0100
Subject: [PATCH 297/302] MIPS: KVM: Fix 64-bit big endian dynamic translation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MFC0 and MTC0 instructions in the guest which cause traps can be
replaced with 32-bit loads and stores to the commpage, however on big
endian 64-bit builds the offset needs to have 4 added so as to
load/store the least significant half of the long instead of the most
significant half.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 9a16ba2cb48792..c793ff19a8a87a 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -103,6 +103,10 @@ int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
 		mfc0_inst.i_format.rt = inst.c0r_format.rt;
 		mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
 			offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+			mfc0_inst.i_format.simmediate |= 4;
+#endif
 	}
 
 	return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
@@ -121,6 +125,10 @@ int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
 	mtc0_inst.i_format.rt = inst.c0r_format.rt;
 	mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
 		offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+		mtc0_inst.i_format.simmediate |= 4;
+#endif
 
 	return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
 }

From 172e02d1474d5c37a8728ccdfdc731c118366144 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:28 +0100
Subject: [PATCH 298/302] MIPS: KVM: Sign extend MFC0/RDHWR results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When emulating MFC0 instructions to load 32-bit values from guest COP0
registers and the RDHWR instruction to read the CC (Count) register,
sign extend the result to comply with the MIPS64 architecture. The
result must be in canonical 32-bit form or the guest may malfunction.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/emulate.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index be18dfe9ecaa21..6eb52b9c98183b 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1072,14 +1072,15 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 #endif
 			/* Get reg */
 			if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
-				vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
+				vcpu->arch.gprs[rt] =
+				    (s32)kvm_mips_read_count(vcpu);
 			} else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
 				vcpu->arch.gprs[rt] = 0x0;
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
 				kvm_mips_trans_mfc0(inst, opc, vcpu);
 #endif
 			} else {
-				vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+				vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel];
 
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
 				kvm_mips_trans_mfc0(inst, opc, vcpu);
@@ -2380,7 +2381,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
 					     current_cpu_data.icache.linesz);
 			break;
 		case MIPS_HWR_CC:		/* Read count register */
-			arch->gprs[rt] = kvm_mips_read_count(vcpu);
+			arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu);
 			break;
 		case MIPS_HWR_CCRES:		/* Count register resolution */
 			switch (current_cpu_data.cputype) {

From 8296963e6e8c656c4d91dfa7245e49672aa9675e Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:29 +0100
Subject: [PATCH 299/302] MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_mips_trans_replace() passes a pointer to KVM_GUEST_KSEGX(). This
breaks on 64-bit builds due to the cast of that 64-bit pointer to a
different sized 32-bit int. Cast the pointer argument to an unsigned
long to work around the warning.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/dyntrans.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index c793ff19a8a87a..d280894915ed0d 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -33,7 +33,7 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
 	unsigned long paddr, flags;
 	void *vaddr;
 
-	if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
+	if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
 		paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
 							    (unsigned long)opc);
 		vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));

From a700434d80eab4c42380a5c57745aff07493784c Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:30 +0100
Subject: [PATCH 300/302] MIPS: KVM: Reset CP0_PageMask during host TLB flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM sometimes flushes host TLB entries, reading each one to check if it
corresponds to a guest KSeg0 address. In the absence of EntryHi.EHInv
bits to invalidate the whole entry, the entries will be set to unique
virtual addresses in KSeg0 (which is not TLB mapped), spaced 2*PAGE_SIZE
apart.

The TLB read however will clobber the CP0_PageMask register with
whatever page size that TLB entry had, and that same page size will be
written back into the TLB entry along with the unique address.

This would cause breakage when transparent huge pages are enabled on
64-bit host kernels, since huge page entries will overlap other nearby
entries when separated by only 2*PAGE_SIZE, causing a machine check
exception.

Fix this by restoring the old CP0_PageMask value (which should be set to
the normal page size) after reading the TLB entry if we're going to go
ahead and invalidate it.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/tlb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index f5f8c2acae53a3..254377d8e0b992 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -332,6 +332,8 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
 			/* Don't blow away guest kernel entries */
 			if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
 				continue;
+
+			write_c0_pagemask(old_pagemask);
 		}
 
 		/* Make sure all entries differ. */

From 40a2df49858eb40ccfe979da69b74d8b203a869b Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 8 Jul 2016 11:53:31 +0100
Subject: [PATCH 301/302] MIPS: Select HAVE_KVM for MIPS64_R{2,6}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are now able to support KVM T&E with MIPS32 guests on some MIPS64r2
and MIPS64r6 hosts, so select HAVE_KVM so it can be enabled.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ac91939b9b7581..29867139851e1d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2
 	select CPU_SUPPORTS_HIGHMEM
 	select CPU_SUPPORTS_HUGEPAGES
 	select CPU_SUPPORTS_MSA
+	select HAVE_KVM
 	help
 	  Choose this option to build a kernel for release 2 or later of the
 	  MIPS64 architecture.  Many modern embedded systems with a 64-bit
@@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6
 	select CPU_SUPPORTS_MSA
 	select GENERIC_CSUM
 	select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+	select HAVE_KVM
 	help
 	  Choose this option to build a kernel for release 6 or later of the
 	  MIPS64 architecture.  New MIPS processors, starting with the Warrior

From 23528bb21ee2c9b27f3feddd77a2a3351a8df148 Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sam.bobroff@au1.ibm.com>
Date: Wed, 20 Jul 2016 13:41:36 +1000
Subject: [PATCH 302/302] KVM: PPC: Introduce KVM_CAP_PPC_HTM

Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to
determine if a PowerPC KVM guest should use HTM (Hardware Transactional
Memory).

This will be used by QEMU to populate the pa-features bits in the
guest's device tree.

Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/powerpc.c | 4 ++++
 include/uapi/linux/kvm.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1ac036e45ed4f8..6ce40dd6fe51a1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = 1;
 		break;
 #endif
+	case KVM_CAP_PPC_HTM:
+		r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+		    is_kvmppc_hv_enabled(kvm);
+		break;
 	default:
 		r = 0;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8f2756c263d475..e98bb4cce6391c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -869,6 +869,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_X2APIC_API 129
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
 
 #ifdef KVM_CAP_IRQ_ROUTING