Merge tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:
 "A bunch of things accumulated for x86 in the last two weeks:

   - Fix guest vtime accounting so that ticks happening while the guest
     is running can also be accounted to it, along with a consolidation
     of the guest-specific context tracking helpers (roughly sketched
     below).

   - Allow the host NMI handler that runs after a VMX VMEXIT to run
     correctly on the kernel stack.

   - Initialize MSR_TSC_AUX when RDPID is supported and not RDTSCP (virt
     relevant - real hw supports both)

   - A code generation improvement to TASK_SIZE_MAX through the use of
     alternatives

   - The usual misc and related cleanups and improvements"
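
For reference, a rough sketch of the consolidated guest enter/exit helpers, reconstructed from the open-coded sequences removed from svm.c and vmx.c in the diff below. The helpers themselves are not part of the excerpted hunks, so treat this as an approximation rather than the exact upstream code:

static __always_inline void kvm_guest_enter_irqoff(void)
{
	/*
	 * VMENTER enables interrupts (host state) while the kernel state
	 * has them disabled.  Tell tracing/lockdep about it, then inform
	 * context tracking (and thus RCU) about the transition to guest
	 * mode -- the same pattern as the removed open-coded sequence.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

static __always_inline void kvm_guest_exit_irqoff(void)
{
	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * still record them as enabled from before guest entry.  Restore
	 * host context before any instrumentable code runs.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}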

* tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  KVM: x86: Consolidate guest enter/exit logic to common helpers
  context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain
  context_tracking: Consolidate guest enter/exit wrappers
  sched/vtime: Move guest enter/exit vtime accounting to vtime.h
  sched/vtime: Move vtime accounting external declarations above inlines
  KVM: x86: Defer vtime accounting 'til after IRQ handling
  context_tracking: Move guest exit vtime accounting to separate helpers
  context_tracking: Move guest exit context tracking to separate helpers
  KVM/VMX: Invoke NMI non-IST entry instead of IST entry
  x86/cpu: Remove write_tsc() and write_rdtscp_aux() wrappers
  x86/cpu: Initialize MSR_TSC_AUX if RDTSCP *or* RDPID is supported
  x86/resctrl: Fix init const confusion
  x86: Delete UD0, UD1 traces
  x86/smpboot: Remove duplicate includes
  x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant
torvalds committed May 9, 2021
2 parents b741596 + bc908e0 commit dd3e401
Showing 16 changed files with 263 additions and 233 deletions.
9 changes: 0 additions & 9 deletions arch/x86/include/asm/bug.h
@@ -7,18 +7,9 @@

/*
 * Despite that some emulators terminate on UD2, we use it for WARN().
 *
 * Since various instruction decoders/specs disagree on the encoding of
 * UD0/UD1.
 */

#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
#define ASM_UD2 ".byte 0x0f, 0x0b"

#define INSN_UD0 0xff0f
#define INSN_UD2 0x0b0f

#define LEN_UD2 2

#ifdef CONFIG_GENERIC_BUG
15 changes: 15 additions & 0 deletions arch/x86/include/asm/idtentry.h
@@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif

/* NMI */

#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
/*
 * Special NOIST entry point for VMX which invokes this on the kernel
 * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
 * 'executing' marker.
 *
 * On 32bit this just uses the regular NMI entry point because 32-bit does
 * not have ISTs.
 */
DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
#else
#define asm_exc_nmi_noist asm_exc_nmi
#endif

DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
4 changes: 0 additions & 4 deletions arch/x86/include/asm/msr.h
@@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
	return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
}

#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))

#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)

struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
int msr_set_bit(u32 msr, u8 bit);
33 changes: 33 additions & 0 deletions arch/x86/include/asm/page_64.h
@@ -56,6 +56,39 @@ static inline void clear_page(void *page)

void copy_page(void *to, void *from);

#ifdef CONFIG_X86_5LEVEL
/*
 * User space process size. This is the first address outside the user range.
 * There are a few constraints that determine this:
 *
 * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
 * address, then that syscall will enter the kernel with a
 * non-canonical return address, and SYSRET will explode dangerously.
 * We avoid this particular problem by preventing anything
 * from being mapped at the maximum canonical address.
 *
 * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
 * CPUs malfunction if they execute code from the highest canonical page.
 * They'll speculate right off the end of the canonical space, and
 * bad things happen. This is worked around in the same way as the
 * Intel problem.
 *
 * With page table isolation enabled, we map the LDT in ... [stay tuned]
 */
static inline unsigned long task_size_max(void)
{
	unsigned long ret;

	alternative_io("movq %[small],%0","movq %[large],%0",
		       X86_FEATURE_LA57,
		       "=r" (ret),
		       [small] "i" ((1ul << 47)-PAGE_SIZE),
		       [large] "i" ((1ul << 56)-PAGE_SIZE));

	return ret;
}
#endif /* CONFIG_X86_5LEVEL */

#endif /* !__ASSEMBLY__ */

#ifdef CONFIG_X86_VSYSCALL_EMULATION
23 changes: 3 additions & 20 deletions arch/x86/include/asm/page_64_types.h
@@ -55,30 +55,13 @@

#ifdef CONFIG_X86_5LEVEL
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
/* See task_size_max() in <asm/page_64.h> */
#else
#define __VIRTUAL_MASK_SHIFT 47
#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
#endif

/*
 * User space process size. This is the first address outside the user range.
 * There are a few constraints that determine this:
 *
 * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
 * address, then that syscall will enter the kernel with a
 * non-canonical return address, and SYSRET will explode dangerously.
 * We avoid this particular problem by preventing anything
 * from being mapped at the maximum canonical address.
 *
 * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
 * CPUs malfunction if they execute code from the highest canonical page.
 * They'll speculate right off the end of the canonical space, and
 * bad things happen. This is worked around in the same way as the
 * Intel problem.
 *
 * With page table isolation enabled, we map the LDT in ... [stay tuned]
 */
#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)

#define TASK_SIZE_MAX task_size_max()
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)

/* This decides where the kernel will search for a free chunk of vm
4 changes: 2 additions & 2 deletions arch/x86/kernel/cpu/common.c
@@ -1851,8 +1851,8 @@ static inline void setup_getcpu(int cpu)
	unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
	struct desc_struct d = { };

	if (boot_cpu_has(X86_FEATURE_RDTSCP))
		write_rdtscp_aux(cpudata);
	if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
		wrmsr(MSR_TSC_AUX, cpudata, 0);

	/* Store CPU and node number in limit. */
	d.limit0 = cpudata;
2 changes: 1 addition & 1 deletion arch/x86/kernel/cpu/resctrl/monitor.c
@@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initdata = {
} mbm_cf_table[] __initconst = {
	{7, CF(1.000000)},
	{15, CF(1.000000)},
	{15, CF(0.969650)},
10 changes: 10 additions & 0 deletions arch/x86/kernel/nmi.c
@@ -524,6 +524,16 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
mds_user_clear_cpu_buffers();
}

#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
DEFINE_IDTENTRY_RAW(exc_nmi_noist)
{
	exc_nmi(regs);
}
#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
#endif

void stop_nmi(void)
{
ignore_nmis++;
3 changes: 0 additions & 3 deletions arch/x86/kernel/smpboot.c
@@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
	return true;
}

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#define X86_MATCH(model)					\
	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
39 changes: 2 additions & 37 deletions arch/x86/kvm/svm/svm.c
@@ -3710,25 +3710,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long vmcb_pa = svm->current_vmcb->pa;

	/*
	 * VMENTER enables interrupts (host state), but the kernel state is
	 * interrupts disabled when this is invoked. Also tell RCU about
	 * it. This is the same logic as for exit_to_user_mode().
	 *
	 * This ensures that e.g. latency analysis on the host observes
	 * guest mode as interrupt enabled.
	 *
	 * guest_enter_irqoff() informs context tracking about the
	 * transition to guest mode and if enabled adjusts RCU state
	 * accordingly.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);
	kvm_guest_enter_irqoff();

	if (sev_es_guest(vcpu->kvm)) {
		__svm_sev_es_vcpu_run(vmcb_pa);
@@ -3748,24 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
		vmload(__sme_page_pa(sd->save_area));
	}

	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * have them in state 'on' as recorded before entering guest mode.
	 * Same as enter_from_user_mode().
	 *
	 * guest_exit_irqoff() restores host context and reinstates RCU if
	 * enabled and required.
	 *
	 * This needs to be done before the below as native_read_msr()
	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
	kvm_guest_exit_irqoff();
}

static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
55 changes: 11 additions & 44 deletions arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)

void vmx_do_interrupt_nmi_irqoff(unsigned long entry);

static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
					unsigned long entry)
{
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
	gate_desc *desc = (gate_desc *)host_idt_base + vector;

	kvm_before_interrupt(vcpu);
	vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
	vmx_do_interrupt_nmi_irqoff(entry);
	kvm_after_interrupt(vcpu);
}

static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
	const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);

	/* if exit due to PF check for async PF */
@@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
		kvm_machine_check();
	/* We need to handle NMIs before interrupts are enabled */
	else if (is_nmi(intr_info))
		handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
		handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}

static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
	u32 intr_info = vmx_get_intr_info(vcpu);
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
	gate_desc *desc = (gate_desc *)host_idt_base + vector;

	if (WARN_ONCE(!is_external_intr(intr_info),
	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
		return;

	handle_interrupt_nmi_irqoff(vcpu, intr_info);
	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
}

static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6662,25 +6664,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					struct vcpu_vmx *vmx)
{
	/*
	 * VMENTER enables interrupts (host state), but the kernel state is
	 * interrupts disabled when this is invoked. Also tell RCU about
	 * it. This is the same logic as for exit_to_user_mode().
	 *
	 * This ensures that e.g. latency analysis on the host observes
	 * guest mode as interrupt enabled.
	 *
	 * guest_enter_irqoff() informs context tracking about the
	 * transition to guest mode and if enabled adjusts RCU state
	 * accordingly.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);
	kvm_guest_enter_irqoff();

	/* L1D Flush includes CPU buffer clear to mitigate MDS */
	if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6696,24 +6680,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,

	vcpu->arch.cr2 = native_read_cr2();

	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * have them in state 'on' as recorded before entering guest mode.
	 * Same as enter_from_user_mode().
	 *
	 * guest_exit_irqoff() restores host context and reinstates RCU if
	 * enabled and required.
	 *
	 * This needs to be done before the below as native_read_msr()
	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
	kvm_guest_exit_irqoff();
}

static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
9 changes: 9 additions & 0 deletions arch/x86/kvm/x86.c
@@ -9315,6 +9315,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
	local_irq_disable();
	kvm_after_interrupt(vcpu);

	/*
	 * Wait until after servicing IRQs to account guest time so that any
	 * ticks that occurred while running the guest are properly accounted
	 * to the guest. Waiting until IRQs are enabled degrades the accuracy
	 * of accounting via context tracking, but the loss of accuracy is
	 * acceptable for all known use cases.
	 */
	vtime_account_guest_exit();

	if (lapic_in_kernel(vcpu)) {
		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
		if (delta != S64_MIN) {
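
For context: vtime_account_guest_exit() itself is not part of the excerpted hunks. Per the "sched/vtime: Move guest enter/exit vtime accounting to vtime.h" patch in this series it lives in <linux/vtime.h>; a sketch of roughly what the CONFIG_VIRT_CPU_ACCOUNTING_GEN variant does (an approximation for illustration, not verbatim upstream code):

static inline void vtime_account_guest_exit(void)
{
	/*
	 * With full vtime accounting active on this CPU, hand the elapsed
	 * guest time to the vtime machinery; otherwise just clear PF_VCPU
	 * so that subsequent ticks are no longer charged as guest time.
	 */
	if (vtime_accounting_enabled_this_cpu())
		vtime_guest_exit(current);
	else
		current->flags &= ~PF_VCPU;
}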