Merge tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:

 - more generic entry code ABI fallout

 - debug register handling bugfixes

 - fix vmalloc mappings on 32-bit kernels

 - kprobes instrumentation output fix on 32-bit kernels

 - fix over-eager WARN_ON_ONCE() on !SMAP hardware

 - NUMA debugging fix

 - fix Clang related crash on !RETPOLINE kernels

* tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry: Unbreak 32bit fast syscall
  x86/debug: Allow a single level of #DB recursion
  x86/entry: Fix AC assertion
  tracing/kprobes, x86/ptrace: Fix regs argument order for i386
  x86, fakenuma: Fix invalid starting node ID
  x86/mm/32: Bring back vmalloc faulting on x86_32
  x86/cmdline: Disable jump tables for cmdline.c
torvalds committed Sep 6, 2020
2 parents 68beef5 + 4facb95 commit 015b315
Showing 9 changed files with 213 additions and 63 deletions.
29 changes: 20 additions & 9 deletions arch/x86/entry/common.c
@@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
{
unsigned int nr = (unsigned int)regs->orig_ax;

if (IS_ENABLED(CONFIG_IA32_EMULATION))
current_thread_info()->status |= TS_COMPAT;
/*
* Subtlety here: if ptrace pokes something larger than 2^32-1 into
* orig_ax, the unsigned int return value truncates it. This may
* or may not be necessary, but it matches the old asm behavior.
*/
return (unsigned int)syscall_enter_from_user_mode(regs, nr);

return (unsigned int)regs->orig_ax;
}

/*
@@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);

/*
* Subtlety here: if ptrace pokes something larger than 2^32-1 into
* orig_ax, the unsigned int return value truncates it. This may
* or may not be necessary, but it matches the old asm behavior.
*/
nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);

do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
}
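
For illustration, the truncation the "Subtlety" comment above describes is just the C cast's modulo-2^32 behaviour. A minimal standalone sketch (userspace demonstration, not kernel code):

        #include <stdio.h>

        int main(void)
        {
                /* A 64-bit ptrace peer can poke a value like this into orig_ax. */
                unsigned long long orig_ax = 0x100000001ULL;

                /* The same cast as above keeps only the low 32 bits. */
                unsigned int nr = (unsigned int)orig_ax;

                printf("nr = %u\n", nr);        /* prints: nr = 1 */
                return 0;
        }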

static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);
int res;

/*
* This cannot use syscall_enter_from_user_mode() as it has to
* fetch EBP before invoking any of the syscall entry work
* functions.
*/
syscall_enter_from_user_mode_prepare(regs);

instrumentation_begin();
/* Fetch EBP from where the vDSO stashed it. */
if (IS_ENABLED(CONFIG_X86_64)) {
@@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
return false;
}

/* The cast truncates any ptrace-induced syscall nr > 2^32 - 1 */
nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
syscall_exit_to_user_mode(regs);
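Per the entry-common.h documentation further down in this diff, the combined helper is the prepare step followed by the work step. A hedged sketch of that equivalence (not the verbatim kernel/entry/common.c implementation):

        long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
        {
                /* Establish lockdep/RCU/tracing state and enable interrupts... */
                syscall_enter_from_user_mode_prepare(regs);
                /* ...then run the TIF-dependent entry work (tracehook, seccomp,
                 * audit) and return the possibly modified syscall number. */
                return syscall_enter_from_user_mode_work(regs, syscall);
        }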
12 changes: 10 additions & 2 deletions arch/x86/include/asm/entry-common.h
@@ -18,8 +18,16 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
* state, not the interrupt state as imagined by Xen.
*/
unsigned long flags = native_save_fl();
WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
X86_EFLAGS_NT));
unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;

/*
* For !SMAP hardware we patch out CLAC on entry.
*/
if (boot_cpu_has(X86_FEATURE_SMAP) ||
(IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
mask |= X86_EFLAGS_AC;

WARN_ON_ONCE(flags & mask);

/* We think we came from user mode. Make sure pt_regs agrees. */
WARN_ON_ONCE(!user_mode(regs));
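Read the new check as: DF and NT must always be clear on entry, and AC must additionally be clear only when the kernel is known to manage it (SMAP hardware, or 64-bit Xen PV). A hypothetical userspace model with the feature tests stubbed out (the booleans are stand-ins for boot_cpu_has()):

        #include <assert.h>
        #include <stdbool.h>

        #define X86_EFLAGS_DF (1UL << 10)       /* direction flag */
        #define X86_EFLAGS_NT (1UL << 14)       /* nested task */
        #define X86_EFLAGS_AC (1UL << 18)       /* alignment check, toggled by stac/clac */

        static bool has_smap;           /* stand-in for boot_cpu_has(X86_FEATURE_SMAP) */
        static bool is_64bit_xenpv;     /* stand-in for the 64-bit XENPV case */

        int main(void)
        {
                unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;

                if (has_smap || is_64bit_xenpv)
                        mask |= X86_EFLAGS_AC;

                /* On !SMAP hardware CLAC is patched out on entry, so a
                 * user-set AC must not trip the warning: */
                unsigned long flags = X86_EFLAGS_AC;
                assert(!(flags & mask));
                return 0;
        }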
2 changes: 1 addition & 1 deletion arch/x86/include/asm/ptrace.h
@@ -327,8 +327,8 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
static const unsigned int argument_offs[] = {
#ifdef __i386__
offsetof(struct pt_regs, ax),
offsetof(struct pt_regs, cx),
offsetof(struct pt_regs, dx),
offsetof(struct pt_regs, cx),
#define NR_REG_ARGUMENTS 3
#else
offsetof(struct pt_regs, di),
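The swap above matches the i386 regparm(3) convention the kernel is built with: the first three integer arguments travel in EAX, EDX and ECX, in that order, so the offsets must read ax, dx, cx. A hypothetical illustration (GCC/Clang, 32-bit x86):

        /* With regparm(3) on 32-bit x86, a lands in %eax, b in %edx and
         * c in %ecx - the order regs_get_kernel_argument() must mirror. */
        int __attribute__((regparm(3))) add3(int a, int b, int c)
        {
                return a + b + c;
        }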
65 changes: 31 additions & 34 deletions arch/x86/kernel/traps.c
@@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
#endif
}

static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
static __always_inline unsigned long debug_read_clear_dr6(void)
{
/*
* Disable breakpoints during exception handling; recursive exceptions
* are exceedingly 'fun'.
*
* Since this function is NOKPROBE, and that also applies to
* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
* HW_BREAKPOINT_W on our stack)
*
* Entry text is excluded for HW_BP_X; cpu_entry_area, which
* includes the entry stack, is excluded for everything.
*/
*dr7 = local_db_save();
unsigned long dr6;

/*
* The Intel SDM says:
@@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
*
* Keep it simple: clear DR6 immediately.
*/
get_debugreg(*dr6, 6);
get_debugreg(dr6, 6);
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
*dr6 &= ~DR6_RESERVED;
}
dr6 &= ~DR6_RESERVED;

static __always_inline void debug_exit(unsigned long dr7)
{
local_db_restore(dr7);
return dr6;
}

/*
@@ -863,6 +849,18 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
/*
* Disable breakpoints during exception handling; recursive exceptions
* are exceedingly 'fun'.
*
* Since this function is NOKPROBE, and that also applies to
* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
* HW_BREAKPOINT_W on our stack)
*
* Entry text is excluded for HW_BP_X; cpu_entry_area, which
* includes the entry stack, is excluded for everything.
*/
unsigned long dr7 = local_db_save();
bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();

@@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,

instrumentation_end();
idtentry_exit_nmi(regs, irq_state);

local_db_restore(dr7);
}
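
The save/restore pair now brackets the whole kernel-mode handler. A hedged sketch of what it does, assuming the get_debugreg()/set_debugreg() accessors (a simplification of arch/x86/include/asm/debugreg.h, not a verbatim copy):

        static __always_inline unsigned long local_db_save_sketch(void)
        {
                unsigned long dr7;

                get_debugreg(dr7, 7);           /* remember the armed breakpoints */
                if (dr7)
                        set_debugreg(0, 7);     /* disarm them for the handler */
                return dr7;
        }

        static __always_inline void local_db_restore_sketch(unsigned long dr7)
        {
                if (dr7)
                        set_debugreg(dr7, 7);   /* re-arm on the way out */
        }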

static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
*/
WARN_ON_ONCE(!user_mode(regs));

/*
* NB: We can't easily clear DR7 here because
* idtentry_exit_to_usermode() can invoke ptrace, schedule, access
* user memory, etc. This means that a recursive #DB is possible. If
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
* Since we're not on the IST stack right now, everything will be
* fine.
*/

irqentry_enter_from_user_mode(regs);
instrumentation_begin();

@@ -907,36 +916,24 @@
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
unsigned long dr6, dr7;

debug_enter(&dr6, &dr7);
exc_debug_kernel(regs, dr6);
debug_exit(dr7);
exc_debug_kernel(regs, debug_read_clear_dr6());
}

/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
unsigned long dr6, dr7;

debug_enter(&dr6, &dr7);
exc_debug_user(regs, dr6);
debug_exit(dr7);
exc_debug_user(regs, debug_read_clear_dr6());
}
#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
{
unsigned long dr6, dr7;

debug_enter(&dr6, &dr7);
unsigned long dr6 = debug_read_clear_dr6();

if (user_mode(regs))
exc_debug_user(regs, dr6);
else
exc_debug_kernel(regs, dr6);

debug_exit(dr7);
}
#endif

2 changes: 1 addition & 1 deletion arch/x86/lib/Makefile
@@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_cmdline.o = -pg
endif

CFLAGS_cmdline.o := -fno-stack-protector
CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
endif
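
The merge log ties -fno-jump-tables to a Clang crash on !RETPOLINE kernels: without retpolines the compiler may lower a dense switch into an indirect jump through a .rodata table of absolute code addresses, which is unsafe for code that can run before those addresses are valid, as the early-boot path using cmdline.c reportedly does. A hypothetical example of a switch that can get this treatment:

        /* At -O2 a dense switch like this may compile to an indirect jump
         * through an absolute-address table - what -fno-jump-tables suppresses. */
        int classify(unsigned int state)
        {
                switch (state) {
                case 0: return 10;
                case 1: return 11;
                case 2: return 12;
                case 3: return 13;
                case 4: return 14;
                default: return -1;
                }
        }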

inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
78 changes: 78 additions & 0 deletions arch/x86/mm/fault.c
@@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}

/*
* Handle a fault on the vmalloc or module mapping area
*
* This is needed because there is a race condition between the time
* when the vmalloc mapping code updates the PMD to the point in time
* where it synchronizes this update with the other page-tables in the
* system.
*
* In this race window another thread/CPU can map an area on the same
* PMD, finds it already present and does not synchronize it with the
* rest of the system yet. As a result v[mz]alloc might return areas
* which are not mapped in every page-table in the system, causing an
* unhandled page-fault when they are accessed.
*/
static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;

/* Make sure we are in vmalloc area: */
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;

/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;

if (pmd_large(*pmd_k))
return 0;

pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;

return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);
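
vmalloc_sync_one() itself is above the fold here; as a rough mental model it copies the kernel reference page table's PMD entry for the faulting address into the current page table. A deliberately simplified, hypothetical sketch (pmd_entry_for() is an invented helper, not a kernel API):

        /* Hypothetical simplification of the sync step on 2-level x86-32: */
        static pmd_t *vmalloc_sync_one_sketch(pgd_t *pgd, unsigned long address)
        {
                pmd_t *pmd_k = pmd_entry_for(init_mm.pgd, address); /* reference */
                pmd_t *pmd   = pmd_entry_for(pgd, address);         /* this task */

                if (!pmd_present(*pmd_k))
                        return NULL;            /* not mapped in the reference either */
                if (!pmd_present(*pmd))
                        set_pmd(pmd, *pmd_k);   /* propagate the missing entry */
                return pmd_k;
        }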

void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
/*
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*
* This is only needed to close a race condition on x86-32 in
* the vmalloc mapping/unmapping code. See the comment above
* vmalloc_fault() for details. On x86-64 the race does not
* exist as the vmalloc mappings don't need to be synchronized
* there.
*/
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
#endif

/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
2 changes: 1 addition & 1 deletion arch/x86/mm/numa_emulation.c
@@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
u64 addr, u64 max_addr, u64 size)
{
return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
0, NULL, NUMA_NO_NODE);
0, NULL, 0);
}

static int __init setup_emu2phys_nid(int *dfl_phys_nid)
51 changes: 42 additions & 9 deletions include/linux/entry-common.h
@@ -110,15 +110,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
#endif

/**
* syscall_enter_from_user_mode - Check and handle work before invoking
* a syscall
* syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
* @regs: Pointer to current's pt_regs
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
* disabled. The calling code has to be non-instrumentable. When the
* function returns all state is correct and the subsequent functions can be
* instrumented.
* function returns all state is correct, interrupts are enabled and the
* subsequent functions can be instrumented.
*
* This handles lockdep, RCU (context tracking) and tracing state.
*
* This is invoked when there is extra architecture specific functionality
* to be done between establishing state and handling user mode entry work.
*/
void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);

/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
* a syscall
* @regs: Pointer to current's pt_regs
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
* enabled after invoking syscall_enter_from_user_mode_prepare() and extra
* architecture specific work.
*
* Returns: The original or a modified syscall number
*
@@ -127,12 +142,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
* syscall_set_return_value() first. If neither of those are called and -1
* is returned, then the syscall will fail with ENOSYS.
*
* The following functionality is handled here:
* It handles the following work items:
*
* 1) Establish state (lockdep, RCU (context tracking), tracing)
* 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
* 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
* __secure_computing(), trace_sys_enter()
* 3) Invocation of audit_syscall_entry()
* 2) Invocation of audit_syscall_entry()
*/
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);

/**
* syscall_enter_from_user_mode - Establish state and check and handle work
* before invoking a syscall
* @regs: Pointer to current's pt_regs
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
* disabled. The calling code has to be non-instrumentable. When the
* function returns all state is correct, interrupts are enabled and the
* subsequent functions can be instrumented.
*
* This is a combination of syscall_enter_from_user_mode_prepare() and
* syscall_enter_from_user_mode_work().
*
* Returns: The original or a modified syscall number. See
* syscall_enter_from_user_mode_work() for further explanation.
*/
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
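
For the ENOSYS behaviour described above, recall that the x86 entry stubs preset regs->ax to -ENOSYS. A hedged sketch of the dispatch that follows the combined helper (illustrative only, not the verbatim do_syscall_32_irqs_on()):

        static __always_inline void dispatch_syscall_sketch(struct pt_regs *regs,
                                                            unsigned int nr)
        {
                if (likely(nr < IA32_NR_syscalls))
                        regs->ax = ia32_sys_call_table[nr](regs);
                /* else: regs->ax keeps the preset -ENOSYS and the syscall fails */
        }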
