Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more kvm updates from Paolo Bonzini:
 "Generic:

   - selftest compilation fix for non-x86

   - KVM: avoid warning on s390 in mark_page_dirty

 x86:

   - fix page write-protection bug and improve comments

   - use binary search to lookup the PMU event filter, add test

   - enable_pmu module parameter support for Intel CPUs

   - switch blocked_vcpu_on_cpu_lock to raw spinlock

   - cleanups of blocked vCPU logic

   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)

   - various small fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...
torvalds committed Jan 22, 2022
2 parents dc5341f + e2e83a7 commit 636b528
Showing 36 changed files with 1,425 additions and 632 deletions.
6 changes: 3 additions & 3 deletions Documentation/virt/kvm/api.rst
@@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
The Stats Data block contains an array of 64-bit values in the same order
as the descriptors in Descriptors block.

4.42 KVM_GET_XSAVE2
------------------
4.134 KVM_GET_XSAVE2
--------------------

:Capability: KVM_CAP_XSAVE2
:Architectures: x86
@@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
limit the attack surface on KVM's MSR emulation code.

8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
-----------------------------
-------------------------------------

Architectures: x86

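The api.rst hunks above renumber the KVM_GET_XSAVE2 section and fix an underline length; the ioctl itself (capability KVM_CAP_XSAVE2, x86 only) returns an XSAVE area whose size userspace must query first. Below is a minimal userspace sketch of that flow — illustrative only, assuming headers and a kernel new enough to expose KVM_CAP_XSAVE2/KVM_GET_XSAVE2, and omitting error handling, guest setup and the AMX xstate-permission prctl:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	struct kvm_xsave *xsave;
	int size;

	/* KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) reports the buffer size in bytes. */
	size = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
	if (size < (int)sizeof(struct kvm_xsave))
		size = sizeof(struct kvm_xsave);

	xsave = calloc(1, size);

	/* Copy out the (possibly larger than 4K) XSAVE area of vCPU 0. */
	if (ioctl(vcpu, KVM_GET_XSAVE2, xsave) < 0)
		perror("KVM_GET_XSAVE2");
	else
		printf("read %d bytes of XSAVE state\n", size);

	free(xsave);
	return 0;
}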
3 changes: 1 addition & 2 deletions arch/x86/include/asm/kvm-x86-ops.h
@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
KVM_X86_OP_NULL(tlb_remote_flush_with_range)
KVM_X86_OP(tlb_flush_gva)
KVM_X86_OP(tlb_flush_guest)
KVM_X86_OP(vcpu_pre_run)
KVM_X86_OP(run)
KVM_X86_OP_NULL(handle_exit)
KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP_NULL(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_NULL(update_cpu_dirty_logging)
KVM_X86_OP_NULL(pre_block)
KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)
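kvm-x86-ops.h is an X-macro list: every KVM_X86_OP()/KVM_X86_OP_NULL() line above is expanded by whichever file includes the header (the kernel does this to declare, define and update the static_call() wrappers that dispatch into the VMX/SVM implementations). The sketch below only illustrates that pattern in a self-contained way; the macro bodies and all demo_* names are hypothetical, not the kernel's:

#include <stdio.h>

/* A cut-down op table in the spirit of struct kvm_x86_ops. */
struct x86_ops {
	int  (*vcpu_pre_run)(int id);
	void (*run)(int id);
	void (*vcpu_blocking)(int id);	/* optional, may stay NULL */
};

/* Hypothetical vendor callbacks standing in for the vmx/svm ones. */
static int  demo_vcpu_pre_run(int id) { printf("pre_run vCPU %d\n", id); return 1; }
static void demo_run(int id)          { printf("run vCPU %d\n", id); }

/* The single list of ops, analogous to kvm-x86-ops.h. */
#define FOR_EACH_OP(OP, OP_NULL)	\
	OP(vcpu_pre_run)		\
	OP(run)				\
	OP_NULL(vcpu_blocking)

/* Expansion 1: build the vendor op table; OP_NULL entries stay NULL. */
#define FILL_OP(name)		.name = demo_##name,
#define FILL_OP_NULL(name)	/* left NULL */
static struct x86_ops demo_ops = {
	FOR_EACH_OP(FILL_OP, FILL_OP_NULL)
};

/* Expansion 2: emit a guarded caller per op, in lieu of static_call(). */
#define DEFINE_CALLER(name)					\
	static void do_##name(int id)				\
	{							\
		if (demo_ops.name)				\
			demo_ops.name(id);			\
	}
FOR_EACH_OP(DEFINE_CALLER, DEFINE_CALLER)

int main(void)
{
	do_vcpu_pre_run(0);
	do_run(0);
	do_vcpu_blocking(0);	/* hook is NULL, silently skipped */
	return 0;
}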
13 changes: 1 addition & 12 deletions arch/x86/include/asm/kvm_host.h
@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
*/
void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);

int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;

/*
* Architecture specific hooks for vCPU blocking due to
* HLT instruction.
* Returns for .pre_block():
* - 0 means continue to block the vCPU.
* - 1 means we cannot block the vCPU since some event
* happens during this period, such as, 'ON' bit in
* posted-interrupts descriptor is set.
*/
int (*pre_block)(struct kvm_vcpu *vcpu);
void (*post_block)(struct kvm_vcpu *vcpu);

void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

79 changes: 66 additions & 13 deletions arch/x86/kvm/cpuid.c
@@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
}

/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
struct kvm_cpuid_entry2 *orig;
int i;

if (nent != vcpu->arch.cpuid_nent)
return -EINVAL;

for (i = 0; i < nent; i++) {
orig = &vcpu->arch.cpuid_entries[i];
if (e2[i].function != orig->function ||
e2[i].index != orig->index ||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
return -EINVAL;
}

return 0;
}

static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{
u32 function;
@@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
}
}

static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{
u32 base = vcpu->arch.kvm_cpuid_base;

if (!base)
return NULL;

return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
}

static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
{
return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent);
}

void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}

void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
int nent)
{
struct kvm_cpuid_entry2 *best;

best = kvm_find_cpuid_entry(vcpu, 1, 0);
best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
}

best = kvm_find_cpuid_entry(vcpu, 7, 0);
best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));

best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);

best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);

best = kvm_find_kvm_cpuid_features(vcpu);
best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);

if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
}

void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
__kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);

static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
{
int r;

__kvm_update_cpuid_runtime(vcpu, e2, nent);

/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly. It would've been better to forbid any
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
if (vcpu->arch.last_vmentry_cpu != -1)
return kvm_cpuid_check_equal(vcpu, e2, nent);

r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
return r;
@@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent;

kvm_update_kvm_cpuid_base(vcpu);
kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu);

return 0;
@@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap);

/*
* Only support guest architectural pmu on a host
* with architectural pmu.
* The guest architecture pmu is only supported if the architecture
* pmu exists on the host and the module parameters allow it.
*/
if (!cap.version)
if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap));

eax.split.version_id = min(cap.version, 2);
@@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
--array->nent;
continue;
}

if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
entry->ecx &= ~BIT_ULL(2);
entry->edx = 0;
}
break;
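The kvm_set_cpuid() change above is the "partially allow KVM_SET_CPUID{,2} after KVM_RUN" item from the pull request: once the vCPU has actually entered the guest (the first successful VM-entry via KVM_RUN is what moves vcpu->arch.last_vmentry_cpu away from -1), a repeated KVM_SET_CPUID2 is accepted only if it carries exactly the CPUID data that is already set; anything else fails with -EINVAL. A rough userspace sketch of that behavior follows — it deliberately skips memory/register setup and error handling that a real VMM would do before KVM_RUN:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MAX_ENT 256

static struct kvm_cpuid2 *alloc_cpuid(void)
{
	struct kvm_cpuid2 *c;

	c = calloc(1, sizeof(*c) + MAX_ENT * sizeof(struct kvm_cpuid_entry2));
	c->nent = MAX_ENT;
	return c;
}

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	struct kvm_cpuid2 *cpuid = alloc_cpuid();

	ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(vcpu, KVM_SET_CPUID2, cpuid);

	/*
	 * A real VMM sets up guest memory and registers here; KVM_RUN is
	 * what makes the vCPU enter the guest for the first time.
	 */
	ioctl(vcpu, KVM_RUN, 0);

	/* Same data again: accepted (kvm_cpuid_check_equal() returns 0). */
	if (ioctl(vcpu, KVM_SET_CPUID2, cpuid) == 0)
		printf("identical KVM_SET_CPUID2 after KVM_RUN: ok\n");

	/* Mutate one leaf: now rejected with -EINVAL. */
	cpuid->entries[0].eax ^= 1;
	if (ioctl(vcpu, KVM_SET_CPUID2, cpuid) < 0)
		perror("modified KVM_SET_CPUID2 after KVM_RUN");

	return 0;
}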
2 changes: 0 additions & 2 deletions arch/x86/kvm/lapic.c
@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
restart_apic_timer(vcpu->arch.apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);

void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
start_sw_timer(apic);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);

void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
{
31 changes: 22 additions & 9 deletions arch/x86/kvm/mmu/mmu.c
@@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
continue;

flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,

PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush);
}
@@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}

/*
* We can flush all the TLBs out of the mmu lock without TLB
* corruption since we just change the spte from writable to
* readonly so that we only need to care the case of changing
* spte from present to present (changing the spte from present
* to nonpresent will flush all the TLBs immediately), in other
* words, the only case we care is mmu_spte_update() where we
* have checked Host-writable | MMU-writable instead of
* PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
* anymore.
* Flush TLBs if any SPTEs had to be write-protected to ensure that
* guest writes are reflected in the dirty bitmap before the memslot
* update completes, i.e. before enabling dirty logging is visible to
* userspace.
*
* Perform the TLB flush outside the mmu_lock to reduce the amount of
* time the lock is held. However, this does mean that another CPU can
* now grab mmu_lock and encounter a write-protected SPTE while CPUs
* still have a writable mapping for the associated GFN in their TLB.
*
* This is safe but requires KVM to be careful when making decisions
* based on the write-protection status of an SPTE. Specifically, KVM
* also write-protects SPTEs to monitor changes to guest page tables
* during shadow paging, and must guarantee no CPUs can write to those
* page before the lock is dropped. As mentioned in the previous
* paragraph, a write-protected SPTE is no guarantee that CPU cannot
* perform writes. So to determine if a TLB flush is truly required, KVM
* will clear a separate software-only bit (MMU-writable) and skip the
* flush if-and-only-if this bit was already clear.
*
* See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
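The rewritten comment in kvm_mmu_slot_remove_write_access() boils down to one rule: when KVM fully write-protects an SPTE, a TLB flush can be skipped if and only if the software-only MMU-writable bit was already clear, because only then is it guaranteed that no CPU still holds a writable translation. A toy model of that decision is below; the hardware W bit position is the usual x86 one, the software bit positions are those from this patch, and the helper itself is hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE		(1ULL << 1)	/* PT_WRITABLE_MASK */
#define SPTE_HOST_WRITABLE	(1ULL << 9)	/* DEFAULT_SPTE_HOST_WRITEABLE */
#define SPTE_MMU_WRITABLE	(1ULL << 10)	/* DEFAULT_SPTE_MMU_WRITEABLE */

/*
 * Fully write-protect one SPTE: clear both the hardware W bit and
 * MMU-writable. The return value tells the caller whether a TLB flush
 * is needed, i.e. whether MMU-writable was still set and some CPU may
 * therefore still hold a writable translation for the page.
 */
static bool wrprot_spte(uint64_t *sptep)
{
	uint64_t old = *sptep;

	*sptep = old & ~(SPTE_WRITABLE | SPTE_MMU_WRITABLE);

	return old & SPTE_MMU_WRITABLE;
}

int main(void)
{
	uint64_t spte = SPTE_WRITABLE | SPTE_HOST_WRITABLE | SPTE_MMU_WRITABLE;

	printf("first write-protect, flush needed:  %d\n", wrprot_spte(&spte));
	printf("second write-protect, flush needed: %d\n", wrprot_spte(&spte));
	return 0;
}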
1 change: 1 addition & 0 deletions arch/x86/kvm/mmu/spte.c
@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)

new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~shadow_host_writable_mask;
new_spte &= ~shadow_mmu_writable_mask;

new_spte = mark_spte_for_access_track(new_spte);

42 changes: 36 additions & 6 deletions arch/x86/kvm/mmu/spte.h
@@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)

/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
@@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));

/*
* *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
* writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
* that map guest pages in read-only memslots and read-only VMAs.
*
* Invariants:
* - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
*
*
* *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
* allows writes to the guest page mapped by the SPTE. This bit is cleared when
* the guest page mapped by the SPTE contains a page table that is being
* monitored for shadow paging. In this case the SPTE can only be made writable
* by unsyncing the shadow page under the mmu_lock.
*
* Invariants:
* - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
* - If MMU-writable is set, Host-writable must be set.
*
* If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
* to track writes for dirty logging. For such SPTEs, KVM will locklessly set
* PT_WRITABLE_MASK upon the next write from the guest and record the write in
* the dirty log (see fast_page_fault()).
*/

/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)

/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked
@@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,

static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & shadow_host_writable_mask) &&
(spte & shadow_mmu_writable_mask);
if (spte & shadow_mmu_writable_mask) {
WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
return true;
}

WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
return false;
}

static inline u64 get_mmio_spte_generation(u64 spte)
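The new comment block in spte.h also spells out the fast-page-fault contract: an SPTE write-protected only for dirty logging keeps MMU-writable set, so the hardware W bit can be restored locklessly on the next guest write and the page marked dirty. The sketch below is a toy version of that lockless restore, not the kernel's fast_page_fault(); the helper names are hypothetical and a GCC-builtin cmpxchg stands in for the kernel's atomic SPTE update:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE		(1ULL << 1)	/* PT_WRITABLE_MASK */
#define SPTE_HOST_WRITABLE	(1ULL << 9)	/* DEFAULT_SPTE_HOST_WRITEABLE */
#define SPTE_MMU_WRITABLE	(1ULL << 10)	/* DEFAULT_SPTE_MMU_WRITEABLE */

/* Same invariant the patch documents: MMU-writable means the SPTE may be
 * made writable again without taking mmu_lock. */
static bool can_locklessly_be_made_writable(uint64_t spte)
{
	return spte & SPTE_MMU_WRITABLE;
}

/*
 * Toy fast-fault path: if the SPTE only lacks the hardware W bit because
 * of dirty-logging write-protection, restore it with a cmpxchg so that a
 * racing zap is noticed; the caller would then mark the gfn dirty.
 */
static bool fast_fix_write_fault(uint64_t *sptep)
{
	uint64_t old = __atomic_load_n(sptep, __ATOMIC_RELAXED);
	uint64_t fixed;

	if (!can_locklessly_be_made_writable(old))
		return false;	/* needs the slow path under mmu_lock */

	fixed = old | SPTE_WRITABLE;
	if (!__atomic_compare_exchange_n(sptep, &old, fixed, false,
					 __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
		return false;	/* lost a race, caller retries */

	/* here KVM would record the write in the dirty log */
	return true;
}

int main(void)
{
	/* Write-protected for dirty logging: W clear, MMU-writable still set. */
	uint64_t spte = SPTE_HOST_WRITABLE | SPTE_MMU_WRITABLE;

	printf("fast fixup succeeded: %d\n", fast_fix_write_fault(&spte));
	printf("spte writable again:  %d\n", !!(spte & SPTE_WRITABLE));
	return 0;
}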
6 changes: 3 additions & 3 deletions arch/x86/kvm/mmu/tdp_mmu.c
@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;

if (!is_writable_pte(iter.old_spte))
break;

new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

if (new_spte == iter.old_spte)
break;

tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
… (diffs for the remaining 27 of the 36 changed files not shown)