diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1a2b5210cdbf58..38e327d4b47903 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -182,6 +182,9 @@ is dependent on the CPU capability and the kernel configuration. The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION ioctl() at run-time. +Creation of the VM will fail if the requested IPA size (whether it is +implicit or explicit) is unsupported on the host. + Please note that configuring the IPA size does not affect the capability exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects size of the address translated by the stage2 level (guest physical to diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 22d933e9b59e52..a7ab84f781f73d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -47,10 +47,10 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 #define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 #define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 @@ -183,16 +183,16 @@ DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) extern void __kvm_flush_vm_context(void); +extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); -extern void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); -extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index c0450828378b52..32ae676236b6b4 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -83,6 +83,11 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt); void __debug_switch_to_guest(struct kvm_vcpu *vcpu); void __debug_switch_to_host(struct kvm_vcpu *vcpu); +#ifdef __KVM_NVHE_HYPERVISOR__ +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu); +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); +#endif + void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); @@ -97,7 +102,8 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); +void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 23f1a557bd9f04..5aa9ed1e9ec618 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -101,6 +101,9 @@ KVM_NVHE_ALIAS(__stop___kvm_ex_table); /* Array containing bases of nVHE per-CPU memory regions. */ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); +/* PMU available static key */ +KVM_NVHE_ALIAS(kvm_arm_pmu_available); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fc4c95dd2d2617..7f06ba76698d84 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -385,11 +385,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* + * We guarantee that both TLBs and I-cache are private to each + * vcpu. If detecting that a vcpu from the same VM has + * previously run on the same physical CPU, call into the + * hypervisor code to nuke the relevant contexts. + * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_id) { - kvm_call_hyp(__kvm_tlb_flush_local_vmid, mmu); + kvm_call_hyp(__kvm_flush_cpu_context, mmu); *last_ran = vcpu->vcpu_id; } diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index b0afad7a99c6e8..e831d3dfd50d7c 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -85,8 +85,10 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // If the hyp context is loaded, go straight to hyp_panic get_loaded_vcpu x0, x1 - cbz x0, hyp_panic + cbnz x0, 1f + b hyp_panic +1: // The hyp context is saved so make sure it is restored to allow // hyp_panic to run at hyp and, subsequently, panic to run in the host. // This makes use of __guest_exit to avoid duplication but sets the @@ -94,7 +96,7 @@ SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) // current state is saved to the guest context but it will only be // accurate if the guest had been completely restored. adr_this_cpu x0, kvm_hyp_ctxt, x1 - adr x1, hyp_panic + adr_l x1, hyp_panic str x1, [x0, #CPU_XREG_OFFSET(30)] get_vcpu_ptr x1, x0 @@ -146,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Now restore the hyp regs restore_callee_saved_regs x2 - set_loaded_vcpu xzr, x1, x2 + set_loaded_vcpu xzr, x2, x3 alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 54f4860cd87c06..6c1f51f25eb340 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -90,15 +90,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - write_sysreg(0, pmselr_el0); - write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + write_sysreg(0, pmselr_el0); + write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + } write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } static inline void __deactivate_traps_common(void) { write_sysreg(0, hstr_el2); - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) + write_sysreg(0, pmuserenr_el0); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 91a711aa8382e1..f401724f12ef75 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -58,16 +58,24 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); } -void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_guest(struct kvm_vcpu *vcpu) +{ __debug_switch_to_guest_common(vcpu); } -void __debug_switch_to_host(struct kvm_vcpu *vcpu) +void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); +} + +void __debug_switch_to_host(struct kvm_vcpu *vcpu) +{ __debug_switch_to_host_common(vcpu); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 6585a7cbbc5662..5d94584840ccfe 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -71,7 +71,8 @@ SYM_FUNC_START(__host_enter) SYM_FUNC_END(__host_enter) /* - * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + * void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, + * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) /* Prepare and exit to the host's panic funciton. */ @@ -82,9 +83,11 @@ SYM_FUNC_START(__hyp_do_panic) hyp_kimg_va lr, x6 msr elr_el2, lr - /* Set the panic format string. Use the, now free, LR as scratch. */ - ldr lr, =__hyp_panic_string - hyp_kimg_va lr, x6 + mov x29, x0 + + /* Load the format string into x0 and arguments into x1-7 */ + ldr x0, =__hyp_panic_string + hyp_kimg_va x0, x6 /* Load the format arguments into x1-7. */ mov x6, x3 @@ -94,9 +97,7 @@ SYM_FUNC_START(__hyp_do_panic) mrs x5, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ - cmp x0, xzr - mov x0, lr - b.eq __host_enter_without_restoring + cbz x29, __host_enter_without_restoring b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f012f8665ecc1c..936328207bde02 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -46,11 +46,11 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } -static void handle___kvm_tlb_flush_local_vmid(struct kvm_cpu_context *host_ctxt) +static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); - __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); + __kvm_flush_cpu_context(kern_hyp_va(mmu)); } static void handle___kvm_timer_set_cntvoff(struct kvm_cpu_context *host_ctxt) @@ -67,9 +67,9 @@ static void handle___kvm_enable_ssbs(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(tmp, SYS_SCTLR); } -static void handle___vgic_v3_get_ich_vtr_el2(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) { - cpu_reg(host_ctxt, 1) = __vgic_v3_get_ich_vtr_el2(); + cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) @@ -115,10 +115,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), - HANDLE_FUNC(__kvm_tlb_flush_local_vmid), + HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_ich_vtr_el2), + HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), HANDLE_FUNC(__vgic_v3_init_lrs), diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f3d0e9eca56cd2..68ab6b4d514140 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,6 +192,14 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmu_switch_needed = __pmu_switch_to_guest(host_ctxt); __sysreg_save_state_nvhe(host_ctxt); + /* + * We must flush and disable the SPE buffer for nVHE, as + * the translation regime(EL1&0) is going to be loaded with + * that of the guest. And we must do this before we change the + * translation regime to EL2 (via MDCR_EL2_E2PB == 0) and + * before we load guest Stage1. + */ + __debug_save_host_buffers_nvhe(vcpu); __adjust_pc(vcpu); @@ -234,11 +242,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) __fpsimd_save_fpexc32(vcpu); + __debug_switch_to_host(vcpu); /* * This must come after restoring the host sysregs, since a non-VHE * system may enable SPE here and make use of the TTBRs. */ - __debug_switch_to_host(vcpu); + __debug_restore_host_buffers_nvhe(vcpu); if (pmu_switch_needed) __pmu_switch_to_host(host_ctxt); @@ -257,7 +266,6 @@ void __noreturn hyp_panic(void) u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg_par(); - bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; @@ -271,7 +279,7 @@ void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } - __hyp_do_panic(restore_host, spsr, elr, par); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index fbde89a2c6e838..229b06748c2084 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -123,7 +123,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -131,6 +131,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536f3..926fc07074f57b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -223,6 +223,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; if (!table) { + data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 80406f463c28fd..ee3682b9873c44 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -405,9 +405,45 @@ void __vgic_v3_init_lrs(void) __gic_v3_set_lr(0, i); } -u64 __vgic_v3_get_ich_vtr_el2(void) +/* + * Return the GIC CPU configuration: + * - [31:0] ICH_VTR_EL2 + * - [62:32] RES0 + * - [63] MMIO (GICv2) capable + */ +u64 __vgic_v3_get_gic_config(void) { - return read_gicreg(ICH_VTR_EL2); + u64 val, sre = read_gicreg(ICC_SRE_EL1); + unsigned long flags = 0; + + /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. To do that safely, we have to prevent any interrupt + * from firing (which would be deadly). + * + * Note that this only makes sense on VHE, as interrupts are + * already masked for nVHE as part of the exception entry to + * EL2. + */ + if (has_vhe()) + flags = local_daif_save(); + + write_gicreg(0, ICC_SRE_EL1); + isb(); + + val = read_gicreg(ICC_SRE_EL1); + + write_gicreg(sre, ICC_SRE_EL1); + isb(); + + if (has_vhe()) + local_daif_restore(flags); + + val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); + val |= read_gicreg(ICH_VTR_EL2); + + return val; } u64 __vgic_v3_read_vmcr(void) diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index fd7895945bbc6f..66f17349f0c369 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -127,7 +127,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_host(&cxt); } -void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) +void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; @@ -135,6 +135,7 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalle1); + asm volatile("ic iallu"); dsb(nsh); isb(); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a43a..8711894db8c224 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1312,8 +1312,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= - (kvm_phys_size(kvm) >> PAGE_SHIFT)) + if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; mmap_read_lock(current->mm); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index d45b8b9a441518..739164324afedb 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -11,6 +11,8 @@ #include +DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + static int kvm_is_in_guest(void) { return kvm_get_running_vcpu() != NULL; @@ -48,6 +50,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { + /* + * Check if HW_PERF_EVENTS are supported by checking the number of + * hardware performance counters. This could ensure the presence of + * a physical PMU and CONFIG_PERF_EVENT is selected. + */ + if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) + static_branch_enable(&kvm_arm_pmu_available); + return perf_register_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e9ec08b0b07033..e32c6e139a09d3 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -823,16 +823,6 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return val & mask; } -bool kvm_arm_support_pmu_v3(void) -{ - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - return (perf_num_counters() > 0); -} - int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_has_pmu(vcpu)) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e81c7ec9e10202..bd354cd45d2860 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -326,10 +326,9 @@ int kvm_set_ipa_limit(void) } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); - WARN(kvm_ipa_limit < KVM_PHYS_SHIFT, - "KVM IPA Size Limit (%d bits) is smaller than default size\n", - kvm_ipa_limit); - kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit); + kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, + ((kvm_ipa_limit < KVM_PHYS_SHIFT) ? + " (Reduced IPA size, limited VM/VMM compatibility)" : "")); return 0; } @@ -358,6 +357,11 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 52915b34235143..6f530925a23144 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -574,9 +574,13 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); + u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); + bool has_v2; int ret; + has_v2 = ich_vtr_el2 >> 63; + ich_vtr_el2 = (u32)ich_vtr_el2; + /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... @@ -594,13 +598,15 @@ int vgic_v3_probe(const struct gic_kvm_info *info) gicv4_enable ? "en" : "dis"); } + kvm_vgic_global_state.vcpu_base = 0; + if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); - kvm_vgic_global_state.vcpu_base = 0; + } else if (!has_v2) { + pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); - kvm_vgic_global_state.vcpu_base = 0; } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 877a4025d8da08..9bc091ecaaeb97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -963,7 +963,7 @@ struct kvm_arch { struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; - struct kvm_apic_map *apic_map; + struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_page_done; @@ -1036,7 +1036,7 @@ struct kvm_arch { bool bus_lock_detection_enabled; - struct kvm_pmu_event_filter *pmu_event_filter; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa593743acf679..1fc0962c89c082 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bfacb7cbe..cc369b9ad8f11c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - kvm_wait_lapic_expire(vcpu); + /* + * Ensure the guest's timer has truly expired before posting an + * interrupt. Open code the relevant checks to avoid querying + * lapic_timer_int_injected(), which will be false since the + * interrupt isn't yet injected. Waiting until after injecting + * is not an option since that won't help a posted interrupt. + */ + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns) + __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } @@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c926c6b899a106..d78915019b082f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt, cpu_relax(); } } else { + /* + * If the SPTE is not MMU-present, there is no backing + * page associated with the SPTE and so no side effects + * that need to be recorded, and exclusive ownership of + * mmu_lock ensures the SPTE can't be made present. + * Note, zapping MMIO SPTEs is also unnecessary as they + * are guarded by the memslots generation, not by being + * unreachable. + */ old_child_spte = READ_ONCE(*sptep); + if (!is_shadow_present_pte(old_child_spte)) + continue; /* * Marking the SPTE as a removed SPTE is not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index baee91c1e93643..58a45bb139f88a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -bool npt_enabled = true; -#else -bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152ea9..47e021bdcc94ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10601,7 +10601,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } else { if (!slot || !slot->npages) - return 0; + return NULL; old_npages = slot->npages; hva = slot->userspace_addr; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 8dcb3e1477bc98..6fd3cda608e4a6 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -13,6 +13,13 @@ #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) +DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available); + +static __always_inline bool kvm_arm_support_pmu_v3(void) +{ + return static_branch_likely(&kvm_arm_pmu_available); +} + #ifdef CONFIG_HW_PERF_EVENTS struct kvm_pmc { @@ -47,7 +54,6 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx); -bool kvm_arm_support_pmu_v3(void); int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, @@ -87,7 +93,6 @@ static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) {} -static inline bool kvm_arm_support_pmu_v3(void) { return false; } static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) {