Skip to content

Commit

Permalink
x86/mm: Clarify hardware vs. software "error_code"
Browse files Browse the repository at this point in the history
We pass around a variable called "error_code" all around the page
fault code.  Sounds simple enough, especially since "error_code" looks
like it exactly matches the values that the hardware gives us on the
stack to report the page fault error code (PFEC in SDM parlance).

But, that's not how it works.

For part of the page fault handler, "error_code" does exactly match
PFEC.  But, during later parts, it diverges and starts to mean
something a bit different.

Give it two names for its two jobs.

The place it diverges is also really screwy.  It's only in a spot
where the hardware tells us we have kernel-mode access that occurred
while we were in usermode accessing user-controlled address space.
Add a warning in there.

Cc: [email protected]
Cc: Jann Horn <[email protected]>
Cc: Sean Christopherson <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
  • Loading branch information
hansendc authored and Peter Zijlstra committed Oct 9, 2018
1 parent 145f573 commit 164477c
Showing 1 changed file with 52 additions and 25 deletions.
77 changes: 52 additions & 25 deletions arch/x86/mm/fault.c
Original file line number Diff line number Diff line change
Expand Up @@ -1208,9 +1208,10 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
* routines.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
Expand All @@ -1236,17 +1237,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
* (hw_error_code & 4) == 0, and that the fault was not a
* protection error (hw_error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}

/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
if (spurious_fault(hw_error_code, address))
return;

/* kprobes don't want to hook the spurious faults: */
Expand All @@ -1256,7 +1257,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);

return;
}
Expand All @@ -1265,11 +1266,11 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(kprobes_fault(regs)))
return;

if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);

if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address, NULL);
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}

Expand All @@ -1278,10 +1279,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}

/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;

/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
Expand All @@ -1291,7 +1299,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
/*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);

sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
Expand All @@ -1300,9 +1327,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,

perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

if (error_code & X86_PF_WRITE)
if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;

/*
Expand All @@ -1322,9 +1349,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* space check, thus avoiding the deadlock:
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return;
}
retry:
Expand All @@ -1340,29 +1367,29 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,

vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (error_code & X86_PF_USER) {
if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
* and pusha to work. ("enter $65535, $31" pushes
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}

Expand All @@ -1371,8 +1398,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, sw_error_code, address, vma);
return;
}

Expand Down Expand Up @@ -1414,13 +1441,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
return;

/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}

up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return;
}

Expand Down

0 comments on commit 164477c

Please sign in to comment.