From 7ad816762f9bf89e940e618ea40c43138b479e10 Mon Sep 17 00:00:00 2001 From: Petteri Aimonen Date: Tue, 16 Jun 2020 11:12:57 +0200 Subject: [PATCH 01/13] x86/fpu: Reset MXCSR to default in kernel_fpu_begin() Previously, kernel floating point code would run with the MXCSR control register value last set by userland code by the thread that was active on the CPU core just before kernel call. This could affect calculation results if rounding mode was changed, or a crash if a FPU/SIMD exception was unmasked. Restore MXCSR to the kernel's default value. [ bp: Carve out from a bigger patch by Petteri, add feature check, add FNINIT call too (amluto). ] Signed-off-by: Petteri Aimonen Signed-off-by: Borislav Petkov Link: https://bugzilla.kernel.org/show_bug.cgi?id=207979 Link: https://lkml.kernel.org/r/20200624114646.28953-2-bp@alien8.de --- arch/x86/include/asm/fpu/internal.h | 5 +++++ arch/x86/kernel/fpu/core.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f45bf9c42..845e7481ab776e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -623,6 +623,11 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * MXCSR and XCR definitions: */ +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + extern unsigned int mxcsr_feature_mask; #define XCR_XFEATURE_ENABLED_MASK 0x00000000 diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06c818967bb637..15247b96c6eaaa 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,6 +101,12 @@ void kernel_fpu_begin(void) copy_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); + + if (boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin); From 009bce1df0bb5eb970b9eb98d963861f7fe353c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Jun 2020 12:26:05 -0700 Subject: [PATCH 02/13] x86/split_lock: Don't write MSR_TEST_CTRL on CPUs that aren't whitelisted Choo! Choo! All aboard the Split Lock Express, with direct service to Wreckage! Skip split_lock_verify_msr() if the CPU isn't whitelisted as a possible SLD-enabled CPU model to avoid writing MSR_TEST_CTRL. MSR_TEST_CTRL exists, and is writable, on many generations of CPUs. Writing the MSR, even with '0', can result in bizarre, undocumented behavior. This fixes a crash on Haswell when resuming from suspend with a live KVM guest. Because APs use the standard SMP boot flow for resume, they will go through split_lock_init() and the subsequent RDMSR/WRMSR sequence, which runs even when sld_state==sld_off to ensure SLD is disabled. On Haswell (at least, my Haswell), writing MSR_TEST_CTRL with '0' will succeed and _may_ take the SMT _sibling_ out of VMX root mode. When KVM has an active guest, KVM performs VMXON as part of CPU onlining (see kvm_starting_cpu()). Because SMP boot is serialized, the resulting flow is effectively: on_each_ap_cpu() { WRMSR(MSR_TEST_CTRL, 0) VMXON } As a result, the WRMSR can disable VMX on a different CPU that has already done VMXON. This ultimately results in a #UD on VMPTRLD when KVM regains control and attempt run its vCPUs. The above voodoo was confirmed by reworking KVM's VMXON flow to write MSR_TEST_CTRL prior to VMXON, and to serialize the sequence as above. Further verification of the insanity was done by redoing VMXON on all APs after the initial WRMSR->VMXON sequence. The additional VMXON, which should VM-Fail, occasionally succeeded, and also eliminated the unexpected #UD on VMPTRLD. The damage done by writing MSR_TEST_CTRL doesn't appear to be limited to VMX, e.g. after suspend with an active KVM guest, subsequent reboots almost always hang (even when fudging VMXON), a #UD on a random Jcc was observed, suspend/resume stability is qualitatively poor, and so on and so forth. kernel BUG at arch/x86/kvm/x86.c:386! CPU: 1 PID: 2592 Comm: CPU 6/KVM Tainted: G D Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:kvm_spurious_fault+0xf/0x20 Call Trace: vmx_vcpu_load_vmcs+0x1fb/0x2b0 vmx_vcpu_load+0x3e/0x160 kvm_arch_vcpu_load+0x48/0x260 finish_task_switch+0x140/0x260 __schedule+0x460/0x720 _cond_resched+0x2d/0x40 kvm_arch_vcpu_ioctl_run+0x82e/0x1ca0 kvm_vcpu_ioctl+0x363/0x5c0 ksys_ioctl+0x88/0xa0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: dbaba47085b0c ("x86/split_lock: Rework the initialization flow of split lock detection") Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200605192605.7439-1-sean.j.christopherson@intel.com --- arch/x86/kernel/cpu/intel.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c25a67a34bd3d9..0ab48f1cdf848f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -49,6 +49,13 @@ enum split_lock_detect_state { static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -1071,7 +1078,8 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { - split_lock_verify_msr(sld_state != sld_off); + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); } static void split_lock_warn(unsigned long ip) @@ -1177,5 +1185,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) return; } + cpu_model_supports_sld = true; split_lock_setup(); } From c9c26150e61de441ab58b25c1f64afc049ee0fee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:11 -0700 Subject: [PATCH 03/13] x86/entry: Assert that syscalls are on the right stack Now that the entry stack is a full page, it's too easy to regress the system call entry code and end up on the wrong stack without noticing. Assert that all system calls (SYSCALL64, SYSCALL32, SYSENTER, and INT80) are on the right stack and have pt_regs in the right place. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/52059e42bb0ab8551153d012d68f7be18d72ff8e.1593191971.git.luto@kernel.org --- arch/x86/entry/common.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bd3f14175193c3..ed8ccc82099547 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,15 @@ #define CREATE_TRACE_POINTS #include +/* Check that the stack and regs on entry from user mode are sane. */ +static void check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} + #ifdef CONFIG_CONTEXT_TRACKING /** * enter_from_user_mode - Establish state when coming from user mode @@ -127,9 +136,6 @@ static long syscall_trace_enter(struct pt_regs *regs) unsigned long ret = 0; u32 work; - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags); if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { @@ -346,6 +352,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -409,6 +417,8 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -460,6 +470,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) vdso_image_32.sym_int80_landing_pad; bool success; + check_user_regs(regs); + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. From d1721250f3ffed9afba3e1fb729947cec64c5a8a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:12 -0700 Subject: [PATCH 04/13] x86/entry: Move SYSENTER's regs->sp and regs->flags fixups into C The SYSENTER asm (32-bit and compat) contains fixups for regs->sp and regs->flags. Move the fixups into C and fix some comments while at it. This is a valid cleanup all by itself, and it also simplifies the subsequent patch that will fix Xen PV SYSENTER. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fe62bef67eda7fac75b8f3dbafccf571dc4ece6b.1593191971.git.luto@kernel.org --- arch/x86/entry/common.c | 12 ++++++++++++ arch/x86/entry/entry_32.S | 5 ++--- arch/x86/entry/entry_64_compat.S | 11 +++++------ 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ed8ccc82099547..f392a8bcd1c3e4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -522,6 +522,18 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} #endif SYSCALL_DEFINE0(ni_syscall) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 024d7d276cd40b..2d0bd5d5f0328d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -933,9 +933,8 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushl $0 /* pt_regs->sp (placeholder) */ pushfl /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ @@ -965,7 +964,7 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_flags_fixed: movl %esp, %eax - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0f974ae01e62b8..7b9d8150f652b2 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -68,16 +68,15 @@ SYM_CODE_START(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp (stashed in bp) */ + pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* * Push flags. This is nasty. First, interrupts are currently - * off, but we need pt_regs->flags to have IF set. Second, even - * if TF was set when SYSENTER started, it's clear by now. We fix - * that later using TIF_SINGLESTEP. + * off, but we need pt_regs->flags to have IF set. Second, if TS + * was set in usermode, it's still set, and we're singlestepping + * through this code. do_SYSENTER_32() will fix up IF. */ pushfq /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ @@ -135,7 +134,7 @@ SYM_CODE_START(entry_SYSENTER_compat) .Lsysenter_flags_fixed: movq %rsp, %rdi - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV From ffae641f57476369b4d503402b37ebe489d23395 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:13 -0700 Subject: [PATCH 05/13] x86/entry/64/compat: Fix Xen PV SYSENTER frame setup The SYSENTER frame setup was nonsense. It worked by accident because the normal code into which the Xen asm jumped (entry_SYSENTER_32/compat) threw away SP without touching the stack. entry_SYSENTER_compat was recently modified such that it relied on having a valid stack pointer, so now the Xen asm needs to invoke it with a valid stack. Fix it up like SYSCALL: use the Xen-provided frame and skip the bare metal prologue. Fixes: 1c3e5d3f60e2 ("x86/entry: Make entry_64_compat.S objtool clean") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Boris Ostrovsky Link: https://lkml.kernel.org/r/947880c41ade688ff4836f665d0c9fcaa9bd1201.1593191971.git.luto@kernel.org --- arch/x86/entry/entry_64_compat.S | 1 + arch/x86/xen/xen-asm_64.S | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 7b9d8150f652b2..381a6de7de9c6e 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -79,6 +79,7 @@ SYM_CODE_START(entry_SYSENTER_compat) pushfq /* pt_regs->flags (except IF = 0) */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ +SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5d252aaeade8b3..e1e1c7eafa60b7 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -161,10 +161,22 @@ SYM_FUNC_END(xen_syscall32_target) /* 32-bit compat sysenter target */ SYM_FUNC_START(xen_sysenter_target) - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp - jmp entry_SYSENTER_compat + /* + * NB: Xen is polite and clears TF from EFLAGS for us. This means + * that we don't need to guard against single step exceptions here. + */ + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSENTER_compat_after_hwframe SYM_FUNC_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ From e4ef7de160c6b12639c4fc49bcacb25b860ac76d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:14 -0700 Subject: [PATCH 06/13] selftests/x86/syscall_nt: Add more flag combinations Add EFLAGS.AC to the mix. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/12924e2fe2c5826568b7fc9436d85ca7f5eb1743.1593191971.git.luto@kernel.org --- tools/testing/selftests/x86/syscall_nt.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 02309a19504131..f060534b66a05b 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -73,6 +73,12 @@ int main(void) printf("[RUN]\tSet NT and issue a syscall\n"); do_it(X86_EFLAGS_NT); + printf("[RUN]\tSet AC and issue a syscall\n"); + do_it(X86_EFLAGS_AC); + + printf("[RUN]\tSet NT|AC and issue a syscall\n"); + do_it(X86_EFLAGS_NT | X86_EFLAGS_AC); + /* * Now try it again with TF set -- TF forces returns via IRET in all * cases except non-ptregs-using 64-bit full fast path syscalls. @@ -80,8 +86,17 @@ int main(void) sethandler(SIGTRAP, sigtrap, 0); + printf("[RUN]\tSet TF and issue a syscall\n"); + do_it(X86_EFLAGS_TF); + printf("[RUN]\tSet NT|TF and issue a syscall\n"); do_it(X86_EFLAGS_NT | X86_EFLAGS_TF); + printf("[RUN]\tSet AC|TF and issue a syscall\n"); + do_it(X86_EFLAGS_AC | X86_EFLAGS_TF); + + printf("[RUN]\tSet NT|AC|TF and issue a syscall\n"); + do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF); + return nerrs == 0 ? 0 : 1; } From a61fa2799ef9bf6c4f54cf7295036577cececc72 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:15 -0700 Subject: [PATCH 07/13] selftests/x86/syscall_nt: Clear weird flags after each test Clear the weird flags before logging to improve strace output -- logging results while, say, TF is set does no one any favors. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/907bfa5a42d4475b8245e18b67a04b13ca51ffdb.1593191971.git.luto@kernel.org --- tools/testing/selftests/x86/syscall_nt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index f060534b66a05b..5fc82b9cebedc9 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -59,6 +59,7 @@ static void do_it(unsigned long extraflags) set_eflags(get_eflags() | extraflags); syscall(SYS_getpid); flags = get_eflags(); + set_eflags(X86_EFLAGS_IF | X86_EFLAGS_FIXED); if ((flags & extraflags) == extraflags) { printf("[OK]\tThe syscall worked and flags are still set\n"); } else { From cced0b24bb545bfe74fea96de84adc23c0146b05 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:16 -0700 Subject: [PATCH 08/13] selftests/x86: Consolidate and fix get/set_eflags() helpers There are several copies of get_eflags() and set_eflags() and they all are buggy. Consolidate them and fix them. The fixes are: Add memory clobbers. These are probably unnecessary but they make sure that the compiler doesn't move something past one of these calls when it shouldn't. Respect the redzone on x86_64. There has no failure been observed related to this, but it's definitely a bug. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/982ce58ae8dea2f1e57093ee894760e35267e751.1593191971.git.luto@kernel.org --- tools/testing/selftests/x86/Makefile | 4 +- tools/testing/selftests/x86/helpers.h | 41 +++++++++++++++++++ .../selftests/x86/single_step_syscall.c | 17 +------- .../testing/selftests/x86/syscall_arg_fault.c | 21 +--------- tools/testing/selftests/x86/syscall_nt.c | 20 +-------- tools/testing/selftests/x86/test_vsyscall.c | 15 +------ tools/testing/selftests/x86/unwind_vdso.c | 23 +---------- 7 files changed, 51 insertions(+), 90 deletions(-) create mode 100644 tools/testing/selftests/x86/helpers.h diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5f16821c7f63a6..d2796ea98c5ac1 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -70,10 +70,10 @@ all_64: $(BINARIES_64) EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64) -$(BINARIES_32): $(OUTPUT)/%_32: %.c +$(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm -$(BINARIES_64): $(OUTPUT)/%_64: %.c +$(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl # x86_64 users should be encouraged to install 32-bit libraries diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h new file mode 100644 index 00000000000000..f5ff2a2615df0a --- /dev/null +++ b/tools/testing/selftests/x86/helpers.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __SELFTESTS_X86_HELPERS_H +#define __SELFTESTS_X86_HELPERS_H + +#include + +static inline unsigned long get_eflags(void) +{ + unsigned long eflags; + + asm volatile ( +#ifdef __x86_64__ + "subq $128, %%rsp\n\t" + "pushfq\n\t" + "popq %0\n\t" + "addq $128, %%rsp" +#else + "pushfl\n\t" + "popl %0" +#endif + : "=r" (eflags) :: "memory"); + + return eflags; +} + +static inline void set_eflags(unsigned long eflags) +{ + asm volatile ( +#ifdef __x86_64__ + "subq $128, %%rsp\n\t" + "pushq %0\n\t" + "popfq\n\t" + "addq $128, %%rsp" +#else + "pushl %0\n\t" + "popfl" +#endif + :: "r" (eflags) : "flags", "memory"); +} + +#endif /* __SELFTESTS_X86_HELPERS_H */ diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c index 1063328e275c90..120ac741fe4405 100644 --- a/tools/testing/selftests/x86/single_step_syscall.c +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -31,6 +31,8 @@ #include #include +#include "helpers.h" + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -67,21 +69,6 @@ static unsigned char altstack_data[SIGSTKSZ]; # define INT80_CLOBBERS #endif -static unsigned long get_eflags(void) -{ - unsigned long eflags; - asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); - return eflags; -} - -static void set_eflags(unsigned long eflags) -{ - asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH - : : "rm" (eflags) : "flags"); -} - -#define X86_EFLAGS_TF (1UL << 8) - static void sigtrap(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index bc0ecc2e862ef7..5b7abebbcbb9b8 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -15,30 +15,11 @@ #include #include -#ifdef __x86_64__ -# define WIDTH "q" -#else -# define WIDTH "l" -#endif +#include "helpers.h" /* Our sigaltstack scratch space. */ static unsigned char altstack_data[SIGSTKSZ]; -static unsigned long get_eflags(void) -{ - unsigned long eflags; - asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); - return eflags; -} - -static void set_eflags(unsigned long eflags) -{ - asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH - : : "rm" (eflags) : "flags"); -} - -#define X86_EFLAGS_TF (1UL << 8) - static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 5fc82b9cebedc9..970e5e14d96d1d 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -13,29 +13,11 @@ #include #include #include -#include -#ifdef __x86_64__ -# define WIDTH "q" -#else -# define WIDTH "l" -#endif +#include "helpers.h" static unsigned int nerrs; -static unsigned long get_eflags(void) -{ - unsigned long eflags; - asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); - return eflags; -} - -static void set_eflags(unsigned long eflags) -{ - asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH - : : "rm" (eflags) : "flags"); -} - static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index a4f4d4cf22c3b4..c41f24b517f401 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -20,6 +20,8 @@ #include #include +#include "helpers.h" + #ifdef __x86_64__ # define VSYS(x) (x) #else @@ -493,21 +495,8 @@ static int test_process_vm_readv(void) } #ifdef __x86_64__ -#define X86_EFLAGS_TF (1UL << 8) static volatile sig_atomic_t num_vsyscall_traps; -static unsigned long get_eflags(void) -{ - unsigned long eflags; - asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags)); - return eflags; -} - -static void set_eflags(unsigned long eflags) -{ - asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags"); -} - static void sigtrap(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 0075ccd65407bb..4c311e1af4c7a8 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -11,6 +11,8 @@ #include #include +#include "helpers.h" + #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16 int main() @@ -53,27 +55,6 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), err(1, "sigaction"); } -#ifdef __x86_64__ -# define WIDTH "q" -#else -# define WIDTH "l" -#endif - -static unsigned long get_eflags(void) -{ - unsigned long eflags; - asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); - return eflags; -} - -static void set_eflags(unsigned long eflags) -{ - asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH - : : "rm" (eflags) : "flags"); -} - -#define X86_EFLAGS_TF (1UL << 8) - static volatile sig_atomic_t nerrs; static unsigned long sysinfo; static bool got_sysinfo = false; From db5b2c5a90a111618f071d231a8b945cf522313e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:53 -0700 Subject: [PATCH 09/13] x86/entry/compat: Clear RAX high bits on Xen PV SYSENTER Move the clearing of the high bits of RAX after Xen PV joins the SYSENTER path so that Xen PV doesn't skip it. Arguably this code should be deleted instead, but that would belong in the merge window. Fixes: ffae641f5747 ("x86/entry/64/compat: Fix Xen PV SYSENTER frame setup") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/9d33b3f3216dcab008070f1c28b6091ae7199969.1593795633.git.luto@kernel.org --- arch/x86/entry/entry_64_compat.S | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 381a6de7de9c6e..541fdaf6404533 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,15 +57,6 @@ SYM_CODE_START(entry_SYSENTER_compat) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq $0 /* pt_regs->sp = 0 (placeholder) */ @@ -80,6 +71,16 @@ SYM_CODE_START(entry_SYSENTER_compat) pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ From 3c73b81a9164d0c1b6379d6672d2772a9e95168e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:54 -0700 Subject: [PATCH 10/13] x86/entry, selftests: Further improve user entry sanity checks Chasing down a Xen bug caused me to realize that the new entry sanity checks are still fairly weak. Add some more checks. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/881de09e786ab93ce56ee4a2437ba2c308afe7a9.1593795633.git.luto@kernel.org --- arch/x86/entry/common.c | 19 +++++++++++++++++++ tools/testing/selftests/x86/syscall_nt.c | 11 +++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f392a8bcd1c3e4..e83b3f14897cc4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,6 +49,23 @@ static void check_user_regs(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. + */ WARN_ON_ONCE(!on_thread_stack()); WARN_ON_ONCE(regs != task_pt_regs(current)); } @@ -577,6 +594,7 @@ SYSCALL_DEFINE0(ni_syscall) bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) { if (user_mode(regs)) { + check_user_regs(regs); enter_from_user_mode(); return false; } @@ -710,6 +728,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) */ void noinstr idtentry_enter_user(struct pt_regs *regs) { + check_user_regs(regs); enter_from_user_mode(); } diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 970e5e14d96d1d..a108b80dd08230 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -81,5 +81,16 @@ int main(void) printf("[RUN]\tSet NT|AC|TF and issue a syscall\n"); do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF); + /* + * Now try DF. This is evil and it's plausible that we will crash + * glibc, but glibc would have to do something rather surprising + * for this to happen. + */ + printf("[RUN]\tSet DF and issue a syscall\n"); + do_it(X86_EFLAGS_DF); + + printf("[RUN]\tSet TF|DF and issue a syscall\n"); + do_it(X86_EFLAGS_TF | X86_EFLAGS_DF); + return nerrs == 0 ? 0 : 1; } From f41f0824224eb12ad84de8972962dd54be5abe3b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:55 -0700 Subject: [PATCH 11/13] x86/entry/xen: Route #DB correctly on Xen PV On Xen PV, #DB doesn't use IST. It still needs to be correctly routed depending on whether it came from user or kernel mode. Get rid of DECLARE/DEFINE_IDTENTRY_XEN -- it was too hard to follow the logic. Instead, route #DB and NMI through DECLARE/DEFINE_IDTENTRY_RAW on Xen, and do the right thing for #DB. Also add more warnings to the exc_debug* handlers to make this type of failure more obvious. This fixes various forms of corruption that happen when usermode triggers #DB on Xen PV. Fixes: 4c0dcd8350a0 ("x86/entry: Implement user mode C entry points for #DB and #MCE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/4163e733cce0b41658e252c6c6b3464f33fdff17.1593795633.git.luto@kernel.org --- arch/x86/include/asm/idtentry.h | 24 ++++++------------------ arch/x86/kernel/traps.c | 12 ++++++++++++ arch/x86/xen/enlighten_pv.c | 28 ++++++++++++++++++++++++---- arch/x86/xen/xen-asm_64.S | 5 ++--- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cf51c50eb356dd..94333ac3092b28 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -398,18 +398,6 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST -/** - * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points - * @vector: Vector number (ignored for C) - * @func: Function name of the entry point - * - * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM - * indirection magic. - */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - asmlinkage void xen_asm_exc_xen##func(void); \ - asmlinkage void asm_exc_xen##func(void) - #else /* !__ASSEMBLY__ */ /* @@ -469,10 +457,6 @@ __visible noinstr void func(struct pt_regs *regs, \ /* No ASM code emitted for NMI */ #define DECLARE_IDTENTRY_NMI(vector, func) -/* XEN NMI and DB wrapper */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - idtentry vector asm_exc_xen##func exc_##func has_error_code=0 - /* * ASM code to emit the common vector entry stubs where each stub is * packed into 8 bytes. @@ -570,11 +554,15 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); +#endif /* #DB */ DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); -DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); +#endif /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f9727b96961fb6..c17f9b57171f5b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -865,6 +865,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, instrumentation_begin(); trace_hardirqs_off_finish(); + /* + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(user_mode(regs)); + /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. @@ -883,6 +889,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(!user_mode(regs)); + idtentry_enter_user(regs); instrumentation_begin(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index acc49fa6a0971d..0d68948c82ad6f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -598,6 +598,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, } #ifdef CONFIG_X86_64 +void noist_exc_debug(struct pt_regs *regs); + +DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) +{ + /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + exc_nmi(regs); +} + +DEFINE_IDTENTRY_RAW(xenpv_exc_debug) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -609,18 +629,18 @@ struct trap_array_entry { .xen = xen_asm_##func, \ .ist_okay = ist_ok } -#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ +#define TRAP_ENTRY_REDIR(func, ist_ok) { \ .orig = asm_##func, \ - .xen = xen_asm_##xenfunc, \ + .xen = xen_asm_xenpv_##func, \ .ist_okay = ist_ok } static struct trap_array_entry trap_array[] = { - TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), + TRAP_ENTRY_REDIR(exc_debug, true ), TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif - TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), + TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index e1e1c7eafa60b7..aab1d99b2b480b 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,10 +29,9 @@ _ASM_NOKPROBE(xen_\name) .endm xen_pv_trap asm_exc_divide_error -xen_pv_trap asm_exc_debug -xen_pv_trap asm_exc_xendebug +xen_pv_trap asm_xenpv_exc_debug xen_pv_trap asm_exc_int3 -xen_pv_trap asm_exc_xennmi +xen_pv_trap asm_xenpv_exc_nmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op From 13cbc0cd4a30c815984ad88e3a2e5976493516a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:56 -0700 Subject: [PATCH 12/13] x86/entry/32: Fix #MC and #DB wiring on x86_32 DEFINE_IDTENTRY_MCE and DEFINE_IDTENTRY_DEBUG were wired up as non-RAW on x86_32, but the code expected them to be RAW. Get rid of all the macro indirection for them on 32-bit and just use DECLARE_IDTENTRY_RAW and DEFINE_IDTENTRY_RAW directly. Also add a warning to make sure that we only hit the _kernel paths in kernel mode. Reported-by: Naresh Kamboju Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/9e90a7ee8e72fd757db6d92e1e5ff16339c1ecf9.1593795633.git.luto@kernel.org --- arch/x86/include/asm/idtentry.h | 23 +++++++++++++---------- arch/x86/kernel/cpu/mce/core.c | 4 +++- arch/x86/kernel/traps.c | 2 +- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 94333ac3092b28..eeac6dc2adaa3b 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -353,10 +353,6 @@ static __always_inline void __##func(struct pt_regs *regs) #else /* CONFIG_X86_64 */ -/* Maps to a regular IDTENTRY on 32bit for now */ -# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY -# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY - /** * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant * @vector: Vector number (ignored for C) @@ -387,16 +383,18 @@ __visible noinstr void func(struct pt_regs *regs, \ #endif /* !CONFIG_X86_64 */ /* C-Code mapping */ +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW + +#ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST -#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW -#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW - #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#endif #else /* !__ASSEMBLY__ */ @@ -443,9 +441,6 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) -# define DECLARE_IDTENTRY_DEBUG(vector, func) \ - DECLARE_IDTENTRY(vector, func) - /* No ASM emitted for DF as this goes through a C shim */ # define DECLARE_IDTENTRY_DF(vector, func) @@ -549,7 +544,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); #ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); +#endif #endif /* NMI */ @@ -559,7 +558,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); #endif /* #DB */ +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); +#endif #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); #endif diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ce9120c4f74094..a6a90b5d7c83b0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1901,6 +1901,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + WARN_ON_ONCE(user_mode(regs)); + /* * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. @@ -1954,7 +1956,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) } #else /* 32bit unified entry point */ -DEFINE_IDTENTRY_MCE(exc_machine_check) +DEFINE_IDTENTRY_RAW(exc_machine_check) { unsigned long dr7; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c17f9b57171f5b..6ed8cc5fbe8fc1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -925,7 +925,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) } #else /* 32 bit does not have separate entry points. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6, dr7; From cc801833a171163edb6385425349ba8903bd1b20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:57 -0700 Subject: [PATCH 13/13] x86/ldt: Disable 16-bit segments on Xen PV Xen PV doesn't implement ESPFIX64, so they don't work right. Disable them. Also print a warning the first time anyone tries to use a 16-bit segment on a Xen PV guest that would otherwise allow it to help people diagnose this change in behavior. This gets us closer to having all x86 selftests pass on Xen PV. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/92b2975459dfe5929ecf34c3896ad920bd9e3f2d.1593795633.git.luto@kernel.org --- arch/x86/kernel/ldt.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8748321c448674..34e918ad34d4ce 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -29,6 +29,8 @@ #include #include +#include + /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) @@ -543,6 +545,37 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) return bytecount; } +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + static DEFINE_MUTEX(xen_warning); + static bool warned; + + mutex_lock(&xen_warning); + if (!warned) { + pr_info("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + warned = true; + } + mutex_unlock(&xen_warning); + + return false; + } +#endif + + return true; +} + static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; @@ -574,7 +607,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; }