Merge tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

 - Fix a couple of SWAPGS fencing issues in the x86 entry code

 - Use the proper operand types in __{get,put}_user() to prevent
   truncation in SEV-ES string I/O

 - Make sure the kernel mappings are present in trampoline_pgd in order
   to prevent any potential accesses to unmapped memory after switching
   to it

 - Fix a trivial list corruption in objtool's pv_ops validation

 - Disable the clocksource watchdog for the TSC on platforms which claim
   that the TSC is constant, does not stop in sleep states, has the
   TSC_ADJUST register and has at most two sockets, to prevent the TSC
   from being erroneously marked unstable.

 - Make sure TSC_ADJUST is always checked, not only when going idle

 - Prevent a stack leak by initializing struct _fpx_sw_bytes properly in
   the FPU code

 - Fix INTEL_FAM6_RAPTORLAKE define naming to adhere to the convention

* tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/xen: Add xenpv_restore_regs_and_return_to_usermode()
  x86/entry: Use the correct fence macro after swapgs in kernel CR3
  x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()
  x86/sev: Fix SEV-ES INS/OUTS instructions for word, dword, and qword
  x86/64/mm: Map all kernel memory into trampoline_pgd
  objtool: Fix pv_ops noinstr validation
  x86/tsc: Disable clocksource watchdog for TSC on qualified platforms
  x86/tsc: Add a timer to make sure TSC_adjust is always checked
  x86/fpu/signal: Initialize sw_bytes in save_xstate_epilog()
  x86/cpu: Drop spurious underscore from RAPTOR_LAKE #define
torvalds committed Dec 5, 2021
2 parents 90bf8d9 + 5c8f6a2 commit f5d54a4
Showing 10 changed files with 159 additions and 43 deletions.
35 changes: 17 additions & 18 deletions arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif

POP_REGS pop_rdi=0

/*
@@ -890,28 +894,22 @@ SYM_CODE_START_LOCAL(paranoid_entry)
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx

/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
*/
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
jns .Lparanoid_entry_swapgs
ret
js .Lparanoid_kernel_gsbase

.Lparanoid_entry_swapgs:
/* EBX = 0 -> SWAPGS required on exit */
xorl %ebx, %ebx
swapgs
.Lparanoid_kernel_gsbase:

/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
* unconditional CR3 write, even in the PTI case. So do an lfence
* to prevent GS speculation, regardless of whether PTI is enabled.
*/
FENCE_SWAPGS_KERNEL_ENTRY

/* EBX = 0 -> SWAPGS required on exit */
xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)

@@ -993,11 +991,6 @@ SYM_CODE_START_LOCAL(error_entry)
pushq %r12
ret

.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
ret

/*
* There are two places in the kernel that can potentially fault with
* usergs. Handle them here. B stepping K8s sometimes report a
@@ -1020,8 +1013,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
jmp .Lerror_entry_done

/*
* Issue an LFENCE to prevent GS speculation, regardless of whether it is a
* kernel or user gsbase.
*/
.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
ret

.Lbstep_iret:
/* Fix truncated RIP */
2 changes: 1 addition & 1 deletion arch/x86/include/asm/intel-family.h
@@ -108,7 +108,7 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */

#define INTEL_FAM6_RAPTOR_LAKE 0xB7
#define INTEL_FAM6_RAPTORLAKE 0xB7

/* "Small Core" Processors (Atom) */

2 changes: 1 addition & 1 deletion arch/x86/kernel/fpu/signal.c
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
struct _fpx_sw_bytes sw_bytes;
struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;

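
As an aside on the save_xstate_epilog() change above: the empty initializer matters because only some fields of the on-stack struct are filled in before the whole object is copied out to the user-space signal frame, so any untouched bytes would otherwise carry stale kernel stack contents. Below is a minimal userspace sketch of that pattern, not kernel code: struct sw_bytes_demo, the slot union and the magic value are invented stand-ins for _fpx_sw_bytes and the copy to user space.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for struct _fpx_sw_bytes. */
struct sw_bytes_demo {
	unsigned int magic;
	unsigned int extended_size;
	unsigned long long xfeatures;
	unsigned int reserved[7];	/* never written by the code below */
};

/* Emulate a stack slot that previously held other data. */
static union {
	unsigned char raw[sizeof(struct sw_bytes_demo)];
	struct sw_bytes_demo s;
} slot;

int main(void)
{
	memset(slot.raw, 0xAA, sizeof(slot.raw));	/* "old" stack contents */

	/* Old pattern: only some fields are written before the whole struct
	 * is copied out; reserved[] still holds the stale 0xAA bytes. */
	slot.s.magic = 0x1234;
	printf("without init: reserved[0] = 0x%08x\n", slot.s.reserved[0]);

	/* Fixed pattern: zero the whole object first, which is what
	 * 'struct _fpx_sw_bytes sw_bytes = {};' achieves in the kernel. */
	memset(&slot.s, 0, sizeof(slot.s));
	slot.s.magic = 0x1234;
	printf("with init:    reserved[0] = 0x%08x\n", slot.s.reserved[0]);

	return 0;
}

The first printf shows the stale 0xaaaaaaaa pattern, the second shows zero; in the kernel case those stale bytes would have ended up in the signal frame visible to user space.
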
57 changes: 39 additions & 18 deletions arch/x86/kernel/sev.c
@@ -294,11 +294,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
char __user *target = (char __user *)dst;
u64 d8;
u32 d4;
u16 d2;
u8 d1;

/*
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +315,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
case 1:
case 1: {
u8 d1;
u8 __user *target = (u8 __user *)dst;

memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
case 2:
}
case 2: {
u16 d2;
u16 __user *target = (u16 __user *)dst;

memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
case 4:
}
case 4: {
u32 d4;
u32 __user *target = (u32 __user *)dst;

memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
case 8:
}
case 8: {
u64 d8;
u64 __user *target = (u64 __user *)dst;

memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
}
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -362,11 +373,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
char __user *s = (char __user *)src;
u64 d8;
u32 d4;
u16 d2;
u8 d1;

/*
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +394,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
case 1:
case 1: {
u8 d1;
u8 __user *s = (u8 __user *)src;

if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
case 2:
}
case 2: {
u16 d2;
u16 __user *s = (u16 __user *)src;

if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
case 4:
}
case 4: {
u32 d4;
u32 __user *s = (u32 __user *)src;

if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
case 8:
}
case 8: {
u64 d8;
u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
}
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
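
The width of a __put_user()/__get_user() access is derived from the type of the pointer argument, which is why the old char __user * destination turned every emulated string-I/O store into a one-byte store. Here is a small userspace sketch of that C-level rule; demo_put_user() is an invented stand-in that only mimics the width-follows-pointer-type behaviour of the real macro.

#include <stdio.h>
#include <string.h>

/*
 * Invented stand-in for __put_user(): like the kernel macro, the store
 * width is whatever the destination pointer points to.
 */
#define demo_put_user(val, ptr)	(*(ptr) = (val), 0)

int main(void)
{
	unsigned int dst;
	unsigned int value = 0x11223344;

	/* Old pattern: the destination is cast to a byte pointer, so only
	 * one byte of the 32-bit value reaches memory. */
	memset(&dst, 0, sizeof(dst));
	demo_put_user(value, (unsigned char *)&dst);
	printf("via byte pointer: 0x%08x\n", dst);	/* 0x00000044 on little endian */

	/* Fixed pattern: the destination keeps its real width, all four
	 * bytes reach memory. */
	memset(&dst, 0, sizeof(dst));
	demo_put_user(value, &dst);
	printf("via u32 pointer:  0x%08x\n", dst);	/* 0x11223344 */

	return 0;
}

In the SEV-ES #VC handler the same effect silently truncated word, dword and qword INS/OUTS accesses to a single byte, which the per-size blocks above avoid by giving each case a correctly typed pointer.
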
28 changes: 24 additions & 4 deletions arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

static void __init tsc_disable_clocksource_watchdog(void)
{
clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;

/*
* Disable the clocksource watchdog when the system has:
* - TSC running at constant frequency
* - TSC which does not stop in C-States
* - the TSC_ADJUST register which allows detecting even minimal
*   modifications
* - not more than two sockets. As the number of sockets cannot be
*   evaluated at the early boot stage where this has to be
*   invoked, check the number of online memory nodes as a
*   fallback solution, which is a reasonable estimate.
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
nr_online_nodes <= 2)
tsc_disable_clocksource_watchdog();
}

/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;

if (tsc_clocksource_reliable || no_tsc_watchdog)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;

if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}

if (tsc_clocksource_reliable || no_tsc_watchdog)
clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
tsc_disable_clocksource_watchdog();

clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
41 changes: 41 additions & 0 deletions arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
static struct timer_list tsc_sync_check_timer;

/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}

/*
* Normally tsc_sync is checked every time the system enters an idle
* state, but there is still a caveat: a system may never enter idle,
* either because it is too busy or because it is purposely configured
* not to enter idle.
*
* So set up a periodic timer (every 10 minutes) to make sure the check
* always runs.
*/

#define SYNC_CHECK_INTERVAL (HZ * 600)

static void tsc_sync_check_timer_fn(struct timer_list *unused)
{
int next_cpu;

tsc_verify_tsc_adjust(false);

/* Run the check for all onlined CPUs in turn */
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(cpu_online_mask);

tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
add_timer_on(&tsc_sync_check_timer, next_cpu);
}

static int __init start_sync_check_timer(void)
{
if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
return 0;

timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
add_timer(&tsc_sync_check_timer);

return 0;
}
late_initcall(start_sync_check_timer);

static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
12 changes: 11 additions & 1 deletion arch/x86/realmode/init.c
@@ -72,6 +72,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
int i;
#endif

base = (unsigned char *)real_mode_header;
@@ -128,8 +129,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;

trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);

/* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_top_pgt[511].pgd;

/*
* Include the entirety of the kernel mapping into the trampoline
* PGD. This way, all mappings present in the normal kernel page
* tables are usable while running on trampoline_pgd.
*/
for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif

sme_sev_setup_real_mode(trampoline_header);
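
For a sense of which entries the new loop in setup_real_mode() copies, here is a small arithmetic sketch of pgd_index() under 4-level paging. PGDIR_SHIFT and PTRS_PER_PGD are the x86-64 4-level values; the PAGE_OFFSET constant below is the usual non-randomized 4-level direct-map base and is only an assumption for this example.

#include <stdio.h>

#define PGDIR_SHIFT	39
#define PTRS_PER_PGD	512UL
#define PAGE_OFFSET	0xffff888000000000UL	/* assumed 4-level direct-map base */

#define pgd_index(addr)	(((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	unsigned long start = pgd_index(PAGE_OFFSET);

	/*
	 * The loop copies init_top_pgt entries from pgd_index(__PAGE_OFFSET)
	 * up to PTRS_PER_PGD - 1, i.e. the whole kernel half of the address
	 * space, instead of only the single entry 511 as before.
	 */
	printf("first copied PGD slot: %lu\n", start);			/* 273 with this base */
	printf("PGD slots copied:      %lu\n", PTRS_PER_PGD - start);	/* 239 */

	return 0;
}

Entry 0 is still written separately so the real-mode stub stays mapped virtual == physical.
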
20 changes: 20 additions & 0 deletions arch/x86/xen/xen-asm.S
@@ -20,6 +20,7 @@

#include <linux/init.h>
#include <linux/linkage.h>
#include <../entry/calling.h>

.pushsection .noinstr.text, "ax"
/*
Expand Down Expand Up @@ -192,6 +193,25 @@ SYM_CODE_START(xen_iret)
jmp hypercall_iret
SYM_CODE_END(xen_iret)

/*
* Xen PV doesn't use the trampoline stack; PER_CPU_VAR(cpu_tss_rw + TSS_sp0)
* is also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
* in Xen PV would move %rsp up to the top of the kernel stack and leave the
* IRET frame below %rsp, where it could be corrupted if an #NMI interrupts.
* And having swapgs_restore_regs_and_return_to_usermode() push the IRET frame
* at the same address would be pointless anyway.
*/
SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
UNWIND_HINT_REGS
POP_REGS

/* stackleak_erase() can work safely on the kernel stack. */
STACKLEAK_ERASE_NOCLOBBER

addq $8, %rsp /* skip regs->orig_ax */
jmp xen_iret
SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)

/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
1 change: 1 addition & 0 deletions tools/objtool/elf.c
@@ -375,6 +375,7 @@ static int read_symbols(struct elf *elf)
return -1;
}
memset(sym, 0, sizeof(*sym));
INIT_LIST_HEAD(&sym->pv_target);
sym->alias = sym;

sym->idx = i;
4 changes: 4 additions & 0 deletions tools/objtool/objtool.c
@@ -153,6 +153,10 @@ void objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func)
!strcmp(func->name, "_paravirt_ident_64"))
return;

/* already added this function */
if (!list_empty(&func->pv_target))
return;

list_add(&func->pv_target, &f->pv_ops[idx].targets);
f->pv_ops[idx].clean = false;
}
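
The two objtool hunks above are halves of one fix: without INIT_LIST_HEAD(), list_empty() on sym->pv_target would read uninitialized pointers, and without the list_empty() guard the same function could be list_add()-ed to a pv_ops target list twice, corrupting the list. Below is a minimal userspace sketch of the same pattern; the dlist_* helpers are a hand-rolled stand-in for the kernel's list_head and the target name is invented.

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for the kernel's struct list_head. */
struct dlist {
	struct dlist *next, *prev;
};

static void dlist_init(struct dlist *h)
{
	h->next = h;
	h->prev = h;
}

static bool dlist_empty(const struct dlist *h)
{
	return h->next == h;
}

static void dlist_add(struct dlist *node, struct dlist *head)
{
	node->next = head->next;
	node->prev = head;
	head->next->prev = node;
	head->next = node;
}

struct pv_target {
	struct dlist entry;
	const char *name;
};

int main(void)
{
	struct dlist targets;
	struct pv_target func = { .name = "example_pv_target" };

	dlist_init(&targets);
	dlist_init(&func.entry);	/* what INIT_LIST_HEAD(&sym->pv_target) provides */

	for (int pass = 0; pass < 2; pass++) {
		/* The added guard: skip functions that are already queued. */
		if (!dlist_empty(&func.entry)) {
			printf("pass %d: already added, skipping\n", pass);
			continue;
		}
		dlist_add(&func.entry, &targets);
		printf("pass %d: added %s\n", pass, func.name);
	}

	return 0;
}

The second pass is skipped instead of splicing the node in again, which mirrors the early return added to objtool_pv_add().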
