Skip to content

Commit

Permalink
Merge tag 'kvm-x86-pmu-6.9' of https://github.com/kvm-x86/linux into …
Browse files Browse the repository at this point in the history
…HEAD

KVM x86 PMU changes for 6.9:

 - Fix several bugs where KVM speciously prevents the guest from utilizing
   fixed counters and architectural event encodings based on whether or not
   guest CPUID reports support for the _architectural_ encoding.

 - Fix a variety of bugs in KVM's emulation of RDPMC, e.g. for "fast" reads,
   priority of VMX interception vs #GP, PMC types in architectural PMUs, etc.

 - Add a selftest to verify KVM correctly emulates RDMPC, counter availability,
   and a variety of other PMC-related behaviors that depend on guest CPUID,
   i.e. are difficult to validate via KVM-Unit-Tests.

 - Zero out PMU metadata on AMD if the virtual PMU is disabled to avoid wasting
   cycles, e.g. when checking if a PMC event needs to be synthesized when
   skipping an instruction.

 - Optimize triggering of emulated events, e.g. for "count instructions" events
   when skipping an instruction, which yields a ~10% performance improvement in
   VM-Exit microbenchmarks when a vPMU is exposed to the guest.

 - Tighten the check for "PMI in guest" to reduce false positives if an NMI
   arrives in the host while KVM is handling an IRQ VM-Exit.
  • Loading branch information
bonzini committed Mar 11, 2024
2 parents b00471a + 812d432 commit e9025cd
Show file tree
Hide file tree
Showing 23 changed files with 1,262 additions and 398 deletions.
4 changes: 1 addition & 3 deletions arch/x86/include/asm/kvm-x86-pmu-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@ BUILD_BUG_ON(1)
* a NULL definition, for example if "static_call_cond()" will be used
* at the call sites.
*/
KVM_X86_PMU_OP(hw_event_available)
KVM_X86_PMU_OP(pmc_idx_to_pmc)
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
KVM_X86_PMU_OP(is_valid_msr)
KVM_X86_PMU_OP(get_msr)
KVM_X86_PMU_OP(set_msr)
Expand Down
11 changes: 10 additions & 1 deletion arch/x86/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ struct kvm_pmc {
#define KVM_PMC_MAX_FIXED 3
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6

struct kvm_pmu {
u8 version;
unsigned nr_arch_gp_counters;
Expand Down Expand Up @@ -1889,8 +1890,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
}
#endif /* CONFIG_HYPERV */

enum kvm_intr_type {
/* Values are arbitrary, but must be non-zero. */
KVM_HANDLING_IRQ = 1,
KVM_HANDLING_NMI,
};

/* Enable perf NMI and timer modes to work, and minimise false positives. */
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
(!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))

void __init kvm_mmu_x86_module_init(void);
int kvm_mmu_vendor_module_init(void);
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kvm/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -3955,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
* protected mode.
*/
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
ctxt->ops->check_rdpmc_early(ctxt, rcx))
return emulate_gp(ctxt, 0);

return X86EMUL_CONTINUE;
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kvm/kvm_emulate.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ struct x86_emulate_ops {
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
Expand Down
163 changes: 103 additions & 60 deletions arch/x86/kvm/pmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
Expand Down Expand Up @@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
* * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
* [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
* [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
*/
Expand Down Expand Up @@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
int idx)
{
int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

if (filter->action == KVM_PMU_EVENT_DENY &&
test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
Expand Down Expand Up @@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
check_pmu_event_filter(pmc);
}

static void reprogram_counter(struct kvm_pmc *pmc)
static int reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
Expand All @@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
emulate_overflow = pmc_pause_counter(pmc);

if (!pmc_event_is_allowed(pmc))
goto reprogram_complete;
return 0;

if (emulate_overflow)
__kvm_perf_overflow(pmc, false);
Expand All @@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)

if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
pmc->idx - INTEL_PMC_IDX_FIXED);
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
if (fixed_ctr_ctrl & 0x1)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
if (fixed_ctr_ctrl & 0x2)
Expand All @@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc)
}

if (pmc->current_config == new_config && pmc_resume_counter(pmc))
goto reprogram_complete;
return 0;

pmc_release_perf_event(pmc);

pmc->current_config = new_config;

/*
* If reprogramming fails, e.g. due to contention, leave the counter's
* regprogram bit set, i.e. opportunistically try again on the next PMU
* refresh. Don't make a new request as doing so can stall the guest
* if reprogramming repeatedly fails.
*/
if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
(eventsel & pmu->raw_event_mask),
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
eventsel & ARCH_PERFMON_EVENTSEL_INT))
return;

reprogram_complete:
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
(eventsel & pmu->raw_event_mask),
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int bit;

for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

if (unlikely(!pmc)) {
clear_bit(bit, pmu->reprogram_pmi);
continue;
}
/*
* The reprogramming bitmap can be written asynchronously by something
* other than the task that holds vcpu->mutex, take care to clear only
* the bits that will actually processed.
*/
BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

reprogram_counter(pmc);
kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
/*
* If reprogramming fails, e.g. due to contention, re-set the
* regprogram bit set, i.e. opportunistically try again on the
* next PMU refresh. Don't make a new request as doing so can
* stall the guest if reprogramming repeatedly fails.
*/
if (reprogram_counter(pmc))
set_bit(pmc->idx, pmu->reprogram_pmi);
}

/*
Expand All @@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
/*
* On Intel, VMX interception has priority over RDPMC exceptions that
* aren't already handled by the emulator, i.e. there are no additional
* check needed for Intel PMUs.
*
* On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
* i.e. an invalid PMC results in a #GP, not #VMEXIT.
*/
if (!kvm_pmu_ops.check_rdpmc_early)
return 0;

return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
Expand Down Expand Up @@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
bool fast_mode = idx & (1u << 31);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
u64 mask = fast_mode ? ~0u : ~0ull;
u64 mask = ~0ull;

if (!pmu->version)
return 1;
Expand Down Expand Up @@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)

bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (!pmc)
continue;

kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
pmc_stop_counter(pmc);
pmc->counter = 0;
pmc->emulated_counter = 0;
Expand All @@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
return;

Expand All @@ -750,8 +761,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
*/
kvm_pmu_reset(vcpu);

bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
static_call(kvm_x86_pmu_refresh)(vcpu);
pmu->version = 0;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
pmu->counter_bitmask[KVM_PMC_GP] = 0;
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
pmu->global_ctrl_mask = ~0ull;
pmu->global_status_mask = ~0ull;
pmu->fixed_ctr_ctrl_mask = ~0ull;
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

if (vcpu->kvm->arch.enable_pmu)
static_call(kvm_x86_pmu_refresh)(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
Expand All @@ -776,10 +801,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
pmu->pmc_in_use, X86_PMC_IDX_MAX);

for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
kvm_for_each_pmc(pmu, pmc, i, bitmask) {
if (pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}

Expand All @@ -799,13 +822,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
unsigned int perf_hw_id)
{
return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
Expand All @@ -817,29 +833,56 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
pmc->idx - INTEL_PMC_IDX_FIXED);
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
select_os = config & 0x1;
select_user = config & 0x2;
}

/*
* Skip the CPL lookup, which isn't free on Intel, if the result will
* be the same regardless of the CPL.
*/
if (select_os == select_user)
return select_os;

return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int i;

for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

if (!pmc || !pmc_event_is_allowed(pmc))
if (!kvm_pmu_has_perf_global_ctrl(pmu))
bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
(unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
return;

kvm_for_each_pmc(pmu, pmc, i, bitmap) {
/*
* Ignore checks for edge detect (all events currently emulated
* but KVM are always rising edges), pin control (unsupported
* by modern CPUs), and counter mask and its invert flag (KVM
* doesn't emulate multiple events in a single clock cycle).
*
* Note, the uppermost nibble of AMD's mask overlaps Intel's
* IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
* bits (bits 35:34). Checking the "in HLE/RTM transaction"
* flags is correct as the vCPU can't be in a transaction if
* KVM is emulating an instruction. Checking the reserved bits
* might be wrong if they are defined in the future, but so
* could ignoring them, so do the simple thing for now.
*/
if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
!pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
continue;

/* Ignore checks for edge detect, pin control, invert and CMASK bits */
if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
kvm_pmu_incr_counter(pmc);
kvm_pmu_incr_counter(pmc);
}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
Expand Down
Loading

0 comments on commit e9025cd

Please sign in to comment.