Skip to content

Commit

Permalink
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Browse files Browse the repository at this point in the history
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - Page ownership tracking between host EL1 and EL2
   - Rely on userspace page tables to create large stage-2 mappings
   - Fix incompatibility between pKVM and kmemleak
   - Fix the PMU reset state, and improve the performance of the virtual
     PMU
   - Move over to the generic KVM entry code
   - Address PSCI reset issues w.r.t. save/restore
   - Preliminary rework for the upcoming pKVM fixed feature
   - A bunch of MM cleanups
   - a vGIC fix for timer spurious interrupts
   - Various cleanups

  s390:
   - enable interpretation of specification exceptions
   - fix a vcpu_idx vs vcpu_id mixup

  x86:
   - fast (lockless) page fault support for the new MMU
   - new MMU now the default
   - increased maximum allowed VCPU count
   - allow inhibit IRQs on KVM_RUN while debugging guests
   - let Hyper-V-enabled guests run with virtualized LAPIC as long as
     they do not enable the Hyper-V "AutoEOI" feature
   - fixes and optimizations for the toggling of AMD AVIC (virtualized
     LAPIC)
   - tuning for the case when two-dimensional paging (EPT/NPT) is
     disabled
   - bugfixes and cleanups, especially with respect to vCPU reset and
     choosing a paging mode based on CR0/CR4/EFER
   - support for 5-level page table on AMD processors

  Generic:
   - MMU notifier invalidation callbacks do not take mmu_lock unless
     necessary
   - improved caching of LRU kvm_memory_slot
   - support for histogram statistics
   - add statistics for halt polling and remote TLB flush requests"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (210 commits)
  KVM: Drop unused kvm_dirty_gfn_invalid()
  KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted
  KVM: MMU: mark role_regs and role accessors as maybe unused
  KVM: MIPS: Remove a "set but not used" variable
  x86/kvm: Don't enable IRQ when IRQ enabled in kvm_wait
  KVM: stats: Add VM stat for remote tlb flush requests
  KVM: Remove unnecessary export of kvm_{inc,dec}_notifier_count()
  KVM: x86/mmu: Move lpage_disallowed_link further "down" in kvm_mmu_page
  KVM: x86/mmu: Relocate kvm_mmu_page.tdp_mmu_page for better cache locality
  Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()"
  KVM: x86/mmu: Remove unused field mmio_cached in struct kvm_mmu_page
  kvm: x86: Increase KVM_SOFT_MAX_VCPUS to 710
  kvm: x86: Increase MAX_VCPUS to 1024
  kvm: x86: Set KVM_MAX_VCPU_ID to 4*KVM_MAX_VCPUS
  KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation
  KVM: x86/mmu: Don't freak out if pml5_root is NULL on 4-level host
  KVM: s390: index kvm->arch.idle_mask by vcpu_idx
  KVM: s390: Enable specification exception interpretation
  KVM: arm64: Trim guest debug exception handling
  KVM: SVM: Add 5-level page table support for SVM
  ...
  • Loading branch information
torvalds committed Sep 7, 2021
2 parents a2b2823 + 109bbba commit 192ad3c
Show file tree
Hide file tree
Showing 120 changed files with 2,891 additions and 1,605 deletions.
36 changes: 28 additions & 8 deletions Documentation/virt/kvm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3357,6 +3357,7 @@ flags which can include the following:
- KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
- KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
- KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390]
- KVM_GUESTDBG_BLOCKIRQ: avoid injecting interrupts/NMI/SMI [x86]

For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
are enabled in memory so we need to ensure breakpoint exceptions are
Expand Down Expand Up @@ -5208,25 +5209,30 @@ by a string of size ``name_size``.
#define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST

#define KVM_STATS_UNIT_SHIFT 4
#define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_NONE (0x0 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES

#define KVM_STATS_BASE_SHIFT 8
#define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT)
#define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT)
#define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT)
#define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2

struct kvm_stats_desc {
__u32 flags;
__s16 exponent;
__u16 size;
__u32 offset;
__u32 unused;
__u32 bucket_size;
char name[];
};

Expand All @@ -5237,21 +5243,35 @@ The following flags are supported:
Bits 0-3 of ``flags`` encode the type:

* ``KVM_STATS_TYPE_CUMULATIVE``
The statistics data is cumulative. The value of data can only be increased.
The statistics reports a cumulative count. The value of data can only be increased.
Most of the counters used in KVM are of this type.
The corresponding ``size`` field for this type is always 1.
All cumulative statistics data are read/write.
* ``KVM_STATS_TYPE_INSTANT``
The statistics data is instantaneous. Its value can be increased or
The statistics reports an instantaneous value. Its value can be increased or
decreased. This type is usually used as a measurement of some resources,
like the number of dirty pages, the number of large pages, etc.
All instant statistics are read only.
The corresponding ``size`` field for this type is always 1.
* ``KVM_STATS_TYPE_PEAK``
The statistics data is peak. The value of data can only be increased, and
represents a peak value for a measurement, for example the maximum number
The statistics data reports a peak value, for example the maximum number
of items in a hash table bucket, the longest time waited and so on.
The value of data can only be increased.
The corresponding ``size`` field for this type is always 1.
* ``KVM_STATS_TYPE_LINEAR_HIST``
The statistic is reported as a linear histogram. The number of
buckets is specified by the ``size`` field. The size of buckets is specified
by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``)
is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last
bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity
value.) The bucket value indicates how many samples fell in the bucket's range.
* ``KVM_STATS_TYPE_LOG_HIST``
The statistic is reported as a logarithmic histogram. The number of
buckets is specified by the ``size`` field. The range of the first bucket is
[0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF).
Otherwise, The Nth bucket (1 < N < ``size``) covers
[pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell
in the bucket's range.

Bits 4-7 of ``flags`` encode the unit:

Expand Down Expand Up @@ -5286,9 +5306,9 @@ unsigned 64bit data.
The ``offset`` field is the offset from the start of Data Block to the start of
the corresponding statistics data.

The ``unused`` field is reserved for future support for other types of
statistics data, like log/linear histogram. Its value is always 0 for the types
defined above.
The ``bucket_size`` field is used as a parameter for histogram statistics data.
It is only used by linear histogram statistics data, specifying the size of a
bucket.

The ``name`` field is the name string of the statistics data. The name string
starts at the end of ``struct kvm_stats_desc``. The maximum length including
Expand Down
6 changes: 6 additions & 0 deletions Documentation/virt/kvm/locking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
can be taken inside a kvm->srcu read-side critical section,
while kvm->slots_lock cannot.

- kvm->mn_active_invalidate_count ensures that pairs of
invalidate_range_start() and invalidate_range_end() callbacks
use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock
are taken on the waiting side in install_new_memslots, so MMU notifiers
must not take either kvm->slots_lock or kvm->slots_arch_lock.

On x86:

- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
Expand Down
18 changes: 9 additions & 9 deletions arch/arm64/include/asm/cpufeature.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
{
u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);

return val == ID_AA64PFR0_EL1_32BIT_64BIT;
return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}

static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
{
u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);

return val == ID_AA64PFR0_EL0_32BIT_64BIT;
return val == ID_AA64PFR0_ELx_32BIT_64BIT;
}

static inline bool id_aa64pfr0_sve(u64 pfr0)
Expand Down Expand Up @@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
{
switch (parange) {
case 0: return 32;
case 1: return 36;
case 2: return 40;
case 3: return 42;
case 4: return 44;
case 5: return 48;
case 6: return 52;
case ID_AA64MMFR0_PARANGE_32: return 32;
case ID_AA64MMFR0_PARANGE_36: return 36;
case ID_AA64MMFR0_PARANGE_40: return 40;
case ID_AA64MMFR0_PARANGE_42: return 42;
case ID_AA64MMFR0_PARANGE_44: return 44;
case ID_AA64MMFR0_PARANGE_48: return 48;
case ID_AA64MMFR0_PARANGE_52: return 52;
/*
* A future PE could use a value unknown to the kernel.
* However, by the "D10.1.4 Principles of the ID scheme
Expand Down
54 changes: 38 additions & 16 deletions arch/arm64/include/asm/kvm_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
#include <asm/types.h>

/* Hyp Configuration Register (HCR) bits */

#define HCR_TID5 (UL(1) << 58)
#define HCR_DCT (UL(1) << 57)
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_AMVOFFEN (UL(1) << 51)
#define HCR_FIEN (UL(1) << 47)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)
Expand All @@ -32,9 +37,9 @@
#define HCR_TVM (UL(1) << 26)
#define HCR_TTLB (UL(1) << 25)
#define HCR_TPU (UL(1) << 24)
#define HCR_TPC (UL(1) << 23)
#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
#define HCR_TSW (UL(1) << 22)
#define HCR_TAC (UL(1) << 21)
#define HCR_TACR (UL(1) << 21)
#define HCR_TIDCP (UL(1) << 20)
#define HCR_TSC (UL(1) << 19)
#define HCR_TID3 (UL(1) << 18)
Expand All @@ -56,12 +61,13 @@
#define HCR_PTW (UL(1) << 2)
#define HCR_SWIO (UL(1) << 1)
#define HCR_VM (UL(1) << 0)
#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39))

/*
* The bits we set in HCR:
* TLOR: Trap LORegion register accesses
* RW: 64bit by default, can be overridden for 32bit VMs
* TAC: Trap ACTLR
* TACR: Trap ACTLR
* TSC: Trap SMC
* TSW: Trap cache operations by set/way
* TWE: Trap WFE
Expand All @@ -76,7 +82,7 @@
* PTW: Take a stage2 fault if a stage1 walk steps in device memory
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_BSU_IS | HCR_FB | HCR_TAC | \
HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO | HCR_PTW )
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
Expand Down Expand Up @@ -275,24 +281,40 @@
#define CPTR_EL2_TTA (1 << 20)
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
#define CPTR_EL2_TZ (1 << 8)
#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */
#define CPTR_EL2_DEFAULT CPTR_EL2_RES1
#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1
#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \
GENMASK(29, 21) | \
GENMASK(19, 14) | \
BIT(11))

/* Hyp Debug Configuration Register bits */
#define MDCR_EL2_E2TB_MASK (UL(0x3))
#define MDCR_EL2_E2TB_SHIFT (UL(24))
#define MDCR_EL2_TTRF (1 << 19)
#define MDCR_EL2_TPMS (1 << 14)
#define MDCR_EL2_HPMFZS (UL(1) << 36)
#define MDCR_EL2_HPMFZO (UL(1) << 29)
#define MDCR_EL2_MTPME (UL(1) << 28)
#define MDCR_EL2_TDCC (UL(1) << 27)
#define MDCR_EL2_HCCD (UL(1) << 23)
#define MDCR_EL2_TTRF (UL(1) << 19)
#define MDCR_EL2_HPMD (UL(1) << 17)
#define MDCR_EL2_TPMS (UL(1) << 14)
#define MDCR_EL2_E2PB_MASK (UL(0x3))
#define MDCR_EL2_E2PB_SHIFT (UL(12))
#define MDCR_EL2_TDRA (1 << 11)
#define MDCR_EL2_TDOSA (1 << 10)
#define MDCR_EL2_TDA (1 << 9)
#define MDCR_EL2_TDE (1 << 8)
#define MDCR_EL2_HPME (1 << 7)
#define MDCR_EL2_TPM (1 << 6)
#define MDCR_EL2_TPMCR (1 << 5)
#define MDCR_EL2_HPMN_MASK (0x1F)
#define MDCR_EL2_TDRA (UL(1) << 11)
#define MDCR_EL2_TDOSA (UL(1) << 10)
#define MDCR_EL2_TDA (UL(1) << 9)
#define MDCR_EL2_TDE (UL(1) << 8)
#define MDCR_EL2_HPME (UL(1) << 7)
#define MDCR_EL2_TPM (UL(1) << 6)
#define MDCR_EL2_TPMCR (UL(1) << 5)
#define MDCR_EL2_HPMN_MASK (UL(0x1F))
#define MDCR_EL2_RES0 (GENMASK(63, 37) | \
GENMASK(35, 30) | \
GENMASK(25, 24) | \
GENMASK(22, 20) | \
BIT(18) | \
GENMASK(16, 15))

/* For compatibility with fault code shared with 32-bit */
#define FSC_FAULT ESR_ELx_FSC_FAULT
Expand Down
7 changes: 3 additions & 4 deletions arch/arm64/include/asm/kvm_asm.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,11 @@
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16
#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16
#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17
#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21
#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20

#ifndef __ASSEMBLY__

Expand Down Expand Up @@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
extern void __vgic_v3_init_lrs(void);

extern u32 __kvm_get_mdcr_el2(void);
extern u64 __kvm_get_mdcr_el2(void);

#define __KVM_EXTABLE(from, to) \
" .pushsection __kvm_ex_table, \"a\"\n" \
Expand Down
17 changes: 13 additions & 4 deletions arch/arm64/include/asm/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
extern unsigned int kvm_sve_max_vl;
int kvm_arm_init_sve(void);

int __attribute_const__ kvm_target_cpu(void);
u32 __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);

Expand Down Expand Up @@ -185,7 +185,6 @@ enum vcpu_sysreg {
PMCNTENSET_EL0, /* Count Enable Set Register */
PMINTENSET_EL1, /* Interrupt Enable Set Register */
PMOVSSET_EL0, /* Overflow Flag Status Set Register */
PMSWINC_EL0, /* Software Increment Register */
PMUSERENR_EL0, /* User Enable Register */

/* Pointer Authentication Registers in a strict increasing order. */
Expand Down Expand Up @@ -287,9 +286,13 @@ struct kvm_vcpu_arch {
/* Stage 2 paging state used by the hardware on next switch */
struct kvm_s2_mmu *hw_mmu;

/* HYP configuration */
/* Values of trap registers for the guest. */
u64 hcr_el2;
u32 mdcr_el2;
u64 mdcr_el2;
u64 cptr_el2;

/* Values of trap registers for the host before guest entry. */
u64 mdcr_el2_host;

/* Exception Information */
struct kvm_vcpu_fault_info fault;
Expand Down Expand Up @@ -576,6 +579,7 @@ struct kvm_vcpu_stat {
u64 wfi_exit_stat;
u64 mmio_exit_user;
u64 mmio_exit_kernel;
u64 signal_exits;
u64 exits;
};

Expand Down Expand Up @@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm);

int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);

static inline bool kvm_vm_is_protected(struct kvm *kvm)
{
return false;
}

int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);

Expand Down
2 changes: 1 addition & 1 deletion arch/arm64/include/asm/kvm_hyp.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr);

#ifndef __KVM_NVHE_HYPERVISOR__
void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
void deactivate_traps_vhe_put(void);
void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
#endif

u64 __guest_enter(struct kvm_vcpu *vcpu);
Expand Down
17 changes: 9 additions & 8 deletions arch/arm64/include/asm/kvm_mmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,24 +252,30 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,

#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)

/*
* When this is (directly or indirectly) used on the TLB invalidation
* path, we rely on a previously issued DSB so that page table updates
* and VMID reads are correctly ordered.
*/
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
struct kvm_vmid *vmid = &mmu->vmid;
u64 vmid_field, baddr;
u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;

baddr = mmu->pgd_phys;
vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT;
return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
}

/*
* Must be called from hyp code running at EL2 with an updated VTTBR
* and interrupts disabled.
*/
static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
struct kvm_arch *arch)
{
write_sysreg(vtcr, vtcr_el2);
write_sysreg(arch->vtcr, vtcr_el2);
write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);

/*
Expand All @@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
}

static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
{
__load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
}

static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
{
return container_of(mmu->arch, struct kvm, arch);
Expand Down
Loading

0 comments on commit 192ad3c

Please sign in to comment.