Merge branch 'x86-seccomp-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 seccomp changes from Ingo Molnar:
 "This tree includes x86 seccomp filter speedups and related preparatory
  work, which touches core seccomp facilities as well.

  The main idea is to split seccomp into two phases, to be able to enter
  a simple fast path for syscalls without ptrace side effects.

  There are no substantial user-visible (or ABI) effects expected from
  this, except that we now emit a better audit record for
  SECCOMP_RET_TRACE events"
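
For context, SECCOMP_RET_TRACE is the filter action whose audit record improves here. As a reminder of where that return value comes from, here is a minimal userspace sketch that installs a filter returning it (standard prctl()/classic-BPF usage, not part of this diff; the choice of __NR_getpid is arbitrary):

    #include <stdio.h>
    #include <stddef.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    int main(void)
    {
        struct sock_filter filter[] = {
            /* Load the syscall number from struct seccomp_data. */
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                     offsetof(struct seccomp_data, nr)),
            /* Hand getpid() to an attached ptrace tracer... */
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
            /* ...and allow everything else. */
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
            .len = sizeof(filter) / sizeof(filter[0]),
            .filter = filter,
        };

        /* Needed so an unprivileged process may install a filter. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
            prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
            perror("prctl");
        return 0;
    }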

* 'x86-seccomp-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86_64, entry: Use split-phase syscall_trace_enter for 64-bit syscalls
  x86_64, entry: Treat regs->ax the same in fastpath and slowpath syscalls
  x86: Split syscall_trace_enter into two phases
  x86, entry: Only call user_exit if TIF_NOHZ
  x86, x32, audit: Fix x32's AUDIT_ARCH wrt audit
  seccomp: Document two-phase seccomp and arch-provided seccomp_data
  seccomp: Allow arch code to provide seccomp_data
  seccomp: Refactor the filter callback and the API
  seccomp,x86,arm,mips,s390: Remove nr parameter from secure_computing
torvalds committed Oct 14, 2014 (2 parents f1bfbd9 + 1dcf74f; commit ba1a96f)
Showing 11 changed files with 371 additions and 157 deletions.
11 changes: 11 additions & 0 deletions arch/Kconfig
@@ -323,6 +323,17 @@ config HAVE_ARCH_SECCOMP_FILTER
results in the system call being skipped immediately.
- seccomp syscall wired up

For best performance, an arch should use seccomp_phase1 and
seccomp_phase2 directly. It should call seccomp_phase1 for all
syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
need to be called from a ptrace-safe context. It must then
call seccomp_phase2 if seccomp_phase1 returns anything other
than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.

As an additional optimization, an arch may provide seccomp_data
directly to seccomp_phase1; this avoids multiple calls
to the syscall_xyz helpers for every syscall.

config SECCOMP_FILTER
def_bool y
depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
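
A minimal sketch of the calling pattern this help text describes, assuming the seccomp_phase1()/seccomp_phase2() API added by this series (SECCOMP_PHASE1_OK and SECCOMP_PHASE1_SKIP live in include/linux/seccomp.h); the hook name and surrounding arch glue are illustrative, not taken from the patch:

    #include <linux/seccomp.h>

    /* Hypothetical arch entry hook; compare syscall_trace_enter_phase1()
     * and syscall_trace_enter_phase2() in arch/x86/kernel/ptrace.c below. */
    static long arch_seccomp_check(struct seccomp_data *sd)
    {
        /* Phase 1 is cheap and need not run in a ptrace-safe context. */
        u32 phase1 = seccomp_phase1(sd);

        if (phase1 == SECCOMP_PHASE1_OK)
            return 0;                      /* run the syscall */
        if (phase1 == SECCOMP_PHASE1_SKIP)
            return -1;                     /* skip the syscall */

        /* Anything else requires the ptrace-safe slow path. */
        return seccomp_phase2(phase1);     /* 0 = run, -1 = skip */
    }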
7 changes: 6 additions & 1 deletion arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
current_thread_info()->syscall = scno;

/* Do the secure computing check first; failures should be fast. */
if (secure_computing(scno) == -1)
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
if (secure_computing() == -1)
return -1;
#else
/* XXX: remove this once OABI gets fixed */
secure_computing_strict(scno);
#endif

if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
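
For reference, the two entry points this hunk chooses between have different contracts. A hypothetical hook showing both, assuming the post-series prototypes (int secure_computing(void) and void secure_computing_strict(int)):

    #include <linux/linkage.h>
    #include <linux/seccomp.h>
    #include <asm/ptrace.h>

    /* Illustrative only -- compare the real hunk above. */
    asmlinkage long example_syscall_trace_enter(struct pt_regs *regs, int scno)
    {
    #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
        /* Filter mode: a -1 return asks the arch to skip the syscall. */
        if (secure_computing() == -1)
            return -1;
    #else
        /* Strict mode: no return value; on a violation the task is
         * killed inside secure_computing_strict(), so reaching the
         * next line means the syscall was permitted. */
        secure_computing_strict(scno);
    #endif
        return scno;
    }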
2 changes: 1 addition & 1 deletion arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
long ret = 0;
user_exit();

if (secure_computing(syscall) == -1)
if (secure_computing() == -1)
return -1;

if (test_thread_flag(TIF_SYSCALL_TRACE) &&
2 changes: 1 addition & 1 deletion arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
long ret = 0;

/* Do the secure computing check first. */
if (secure_computing(regs->gprs[2])) {
if (secure_computing()) {
/* seccomp failures shouldn't expose any additional code. */
ret = -1;
goto out;
6 changes: 5 additions & 1 deletion arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX

.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with
movq_cfi rcx, 5*8
.endif

.if \rax_enosys
movq $-ENOSYS, 4*8(%rsp)
.else
movq_cfi rax, 4*8
.endif

.if \save_r891011
movq_cfi r8, 3*8
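
The new rax_enosys argument lets the 64-bit fast path preload the saved rax slot with -ENOSYS instead of the incoming rax, which is what allows the badsys label to disappear from entry_64.S below: an out-of-range syscall number simply falls through to the return path. A runnable userspace model of the pattern (all names here are made up for illustration):

    #include <stdio.h>
    #include <errno.h>

    typedef long (*sys_fn)(long);
    static long sys_hello(long x) { return x + 1; }
    static sys_fn table[] = { sys_hello };
    #define NR_MAX 0UL

    static long dispatch(unsigned long nr, long arg)
    {
        long ret = -ENOSYS;        /* slot preloaded, like rax_enosys=1 */

        if (nr <= NR_MAX)          /* cmpl $__NR_syscall_max,%eax */
            ret = table[nr](arg);  /* in range: overwrite with result */
        return ret;                /* out of range: no badsys needed */
    }

    int main(void)
    {
        /* 42 from the real handler; -ENOSYS for the bogus number. */
        printf("%ld %ld\n", dispatch(0, 41), dispatch(99, 0));
        return 0;
    }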
5 changes: 5 additions & 0 deletions arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
int error_code, int si_code);


extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
unsigned long phase1_result);

extern long syscall_trace_enter(struct pt_regs *);
extern void syscall_trace_leave(struct pt_regs *);

51 changes: 19 additions & 32 deletions arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@ GLOBAL(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
SAVE_ARGS 8, 0, rax_enosys=1
movq_cfi rax,(ORIG_RAX-ARGOFFSET)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@ system_call_fastpath:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
ja badsys
ja ret_from_sys_call /* and return regs->ax */
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,27 +476,7 @@ sysret_signal:
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
jmp int_check_syscall_exit_work

badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
/*
* Fast path for syscall audit without full syscall trace.
* We just call __audit_syscall_entry() directly, and then
* jump back to the normal fast path.
*/
auditsys:
movq %r10,%r9 /* 6th arg: 4th syscall arg */
movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath

/*
* Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
@@ -514,18 +494,25 @@ sysret_audit:

/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
leaq -REST_SKIP(%rsp), %rdi
movq $AUDIT_ARCH_X86_64, %rsi
call syscall_trace_enter_phase1
test %rax, %rax
jnz tracesys_phase2 /* if needed, run the slow path */
LOAD_ARGS 0 /* else restore clobbered regs */
jmp system_call_fastpath /* and return to the fast path */

tracesys_phase2:
SAVE_REST
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
movq %rax,%rdx
call syscall_trace_enter_phase2

/*
* Reload arg registers from stack in case ptrace changed them.
* We don't reload %rax because syscall_trace_enter() returned
* We don't reload %rax because syscall_trace_entry_phase2() returned
* the value it wants us to use in the table lookup.
*/
LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@ tracesys:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
ja int_ret_from_sys_call /* RAX(%rsp) is already set */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
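
Restated in C, the new tracesys flow is roughly the sketch below (it parallels the syscall_trace_enter() wrapper kept in arch/x86/kernel/ptrace.c further down; SAVE_REST and the stack-frame fixups have no C equivalent and are omitted):

    #include <linux/audit.h>       /* AUDIT_ARCH_X86_64 */
    #include <asm/ptrace.h>        /* syscall_trace_enter_phase1/2 */

    /* Sketch only: the real code is the assembly above. */
    static long tracesys_in_c(struct pt_regs *regs)
    {
        /* Phase 1 runs on the partial fast-path stack frame. */
        unsigned long phase1 =
            syscall_trace_enter_phase1(regs, AUDIT_ARCH_X86_64);

        if (phase1 == 0)           /* back to system_call_fastpath */
            return regs->orig_ax;

        /* Phase 2 needs the full frame (SAVE_REST in the asm). */
        return syscall_trace_enter_phase2(regs, AUDIT_ARCH_X86_64, phase1);
    }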
165 changes: 135 additions & 30 deletions arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
force_sig_info(SIGTRAP, &info, tsk);
}


#ifdef CONFIG_X86_32
# define IS_IA32 1
#elif defined CONFIG_IA32_EMULATION
# define IS_IA32 is_compat_task()
#else
# define IS_IA32 0
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
audit_syscall_entry(arch, regs->orig_ax, regs->di,
regs->si, regs->dx, regs->r10);
} else
#endif
{
audit_syscall_entry(arch, regs->orig_ax, regs->bx,
regs->cx, regs->dx, regs->si);
}
}

/*
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
* We can return 0 to resume the syscall or anything else to go to phase
* 2. If we resume the syscall, we need to put something appropriate in
* regs->orig_ax.
*
* NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
* are fully functional.
*
* For phase 2's benefit, our return value is:
* 0: resume the syscall
* 1: go to phase 2; no seccomp phase 2 needed
* anything else: go to phase 2; pass return value to seccomp
*/
long syscall_trace_enter(struct pt_regs *regs)
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
unsigned long ret = 0;
u32 work;

BUG_ON(regs != task_pt_regs(current));

work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;

/*
* If TIF_NOHZ is set, we are required to call user_exit() before
* doing anything that could touch RCU.
*/
if (work & _TIF_NOHZ) {
user_exit();
work &= ~TIF_NOHZ;
}

#ifdef CONFIG_SECCOMP
/*
* Do seccomp first -- it should minimize exposure of other
* code, and keeping seccomp fast is probably more valuable
* than the rest of this.
*/
if (work & _TIF_SECCOMP) {
struct seccomp_data sd;

sd.arch = arch;
sd.nr = regs->orig_ax;
sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
sd.args[0] = regs->di;
sd.args[1] = regs->si;
sd.args[2] = regs->dx;
sd.args[3] = regs->r10;
sd.args[4] = regs->r8;
sd.args[5] = regs->r9;
} else
#endif
{
sd.args[0] = regs->bx;
sd.args[1] = regs->cx;
sd.args[2] = regs->dx;
sd.args[3] = regs->si;
sd.args[4] = regs->di;
sd.args[5] = regs->bp;
}

BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);

ret = seccomp_phase1(&sd);
if (ret == SECCOMP_PHASE1_SKIP) {
regs->orig_ax = -1;
ret = 0;
} else if (ret != SECCOMP_PHASE1_OK) {
return ret; /* Go directly to phase 2 */
}

work &= ~_TIF_SECCOMP;
}
#endif

/* Do our best to finish without phase 2. */
if (work == 0)
return ret; /* seccomp and/or nohz only (ret == 0 here) */

#ifdef CONFIG_AUDITSYSCALL
if (work == _TIF_SYSCALL_AUDIT) {
/*
* If there is no more work to be done except auditing,
* then audit in phase 1. Phase 2 always audits, so, if
* we audit here, then we can't go on to phase 2.
*/
do_audit_syscall_entry(regs, arch);
return 0;
}
#endif

return 1; /* Something is enabled that we can't handle in phase 1 */
}

/* Returns the syscall nr to run (which should match regs->orig_ax). */
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
long ret = 0;
u32 work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;

user_exit();
BUG_ON(regs != task_pt_regs(current));

/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
* do_debug() and we need to set it again to restore the user
* state. If we entered on the slow path, TF was already set.
*/
if (test_thread_flag(TIF_SINGLESTEP))
if (work & _TIF_SINGLESTEP)
regs->flags |= X86_EFLAGS_TF;

/* do the secure computing check first */
if (secure_computing(regs->orig_ax)) {
#ifdef CONFIG_SECCOMP
/*
* Call seccomp_phase2 before running the other hooks so that
* they can see any changes made by a seccomp tracer.
*/
if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
/* seccomp failures shouldn't expose any additional code. */
ret = -1L;
goto out;
return -1;
}
#endif

if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
if (unlikely(work & _TIF_SYSCALL_EMU))
ret = -1L;

if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);

if (IS_IA32)
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_ax,
regs->bx, regs->cx,
regs->dx, regs->si);
#ifdef CONFIG_X86_64
else
audit_syscall_entry(AUDIT_ARCH_X86_64,
regs->orig_ax,
regs->di, regs->si,
regs->dx, regs->r10);
#endif
do_audit_syscall_entry(regs, arch);

out:
return ret ?: regs->orig_ax;
}

long syscall_trace_enter(struct pt_regs *regs)
{
u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);

if (phase1_result == 0)
return regs->orig_ax;
else
return syscall_trace_enter_phase2(regs, arch, phase1_result);
}

void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
2 changes: 1 addition & 1 deletion arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
tmp = secure_computing(syscall_nr);
tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
