From 084c7007804b08d13acc90bf3145c5ac82d1c8a1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 8 Oct 2018 14:33:05 -0700
Subject: [PATCH] core: support cgroup v2 device controller

Cgroup v2 provides the eBPF-based device controller, which isn't currently
supported by systemd. This commit aims to provide such support.

There are no user-visible changes, just the device policy and whitelist
start working if cgroup v2 is used.
---
 src/basic/cgroup-util.c     |   1 +
 src/basic/cgroup-util.h     |   2 +
 src/core/bpf-devices.c      | 257 ++++++++++++++++++++++++++++++++++++
 src/core/bpf-devices.h      |  16 +++
 src/core/cgroup.c           | 125 ++++++++++++------
 src/core/meson.build        |   2 +
 src/core/unit.c             |   2 +
 src/core/unit.h             |   3 +
 src/test/test-cgroup-mask.c |   2 +-
 9 files changed, 370 insertions(+), 40 deletions(-)
 create mode 100644 src/core/bpf-devices.c
 create mode 100644 src/core/bpf-devices.h

diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 6c38e590312a8..7728b63a5f299 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -2768,6 +2768,7 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
         [CGROUP_CONTROLLER_DEVICES] = "devices",
         [CGROUP_CONTROLLER_PIDS] = "pids",
         [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
+        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
 };

 DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h
index 23602b6da9523..73b431a27a562 100644
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@@ -30,6 +30,7 @@ typedef enum CGroupController {

         /* BPF-based pseudo-controllers, v2 only */
         CGROUP_CONTROLLER_BPF_FIREWALL,
+        CGROUP_CONTROLLER_BPF_DEVICES,

         _CGROUP_CONTROLLER_MAX,
         _CGROUP_CONTROLLER_INVALID = -1,
@@ -47,6 +48,7 @@ typedef enum CGroupMask {
         CGROUP_MASK_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICES),
         CGROUP_MASK_PIDS = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS),
         CGROUP_MASK_BPF_FIREWALL = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FIREWALL),
+        CGROUP_MASK_BPF_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_DEVICES),
         _CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1
 } CGroupMask;

diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c
new file mode 100644
index 0000000000000..1a2153122aa29
--- /dev/null
+++ b/src/core/bpf-devices.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#include <linux/libbpf.h>
+
+#include "bpf-devices.h"
+#include "bpf-program.h"
+
+#define PASS_JUMP_OFF 4096
+
+/* Translates an access string consisting of 'r', 'w' and 'm' into a mask of BPF_DEVCG_ACC_* bits.
+ * Returns -EINVAL on any other character, and 0 if the string is empty. */
+static int bpf_access_type(const char *acc) {
+        int r = 0;
+
+        assert(acc);
+
+        for (; *acc; acc++)
+                switch(*acc) {
+                case 'r':
+                        r |= BPF_DEVCG_ACC_READ;
+                        break;
+                case 'w':
+                        r |= BPF_DEVCG_ACC_WRITE;
+                        break;
+                case 'm':
+                        r |= BPF_DEVCG_ACC_MKNOD;
+                        break;
+                default:
+                        return -EINVAL;
+                }
+
+        return r;
+}
+
+/* Appends a rule to prog that jumps to the ALLOW return if the device type, major and minor match and
+ * all requested access bits are contained in acc. The jump uses the PASS_JUMP_OFF placeholder offset,
+ * which cgroup_apply_device_bpf() patches later. Returns -EINVAL if acc is empty or invalid. */
+int cgroup_bpf_whitelist_device(BPFProgram *prog, int type, int major, int minor, const char *acc) {
+        struct bpf_insn insn[] = {
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 6), /* compare device type */
+                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
+                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */
+                BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+        };
+        int r, access;
+
+        assert(prog);
+        assert(acc);
+
+        access = bpf_access_type(acc);
+        if (access <= 0)
+                return -EINVAL;
+
+        insn[2].imm = access;
+
+        r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+        if (r < 0)
+                log_error_errno(r, "Extending device control BPF program failed: %m");
+
+        return r;
+}
+
+/* Like cgroup_bpf_whitelist_device(), but the rule matches every minor number of the given major. */
+int cgroup_bpf_whitelist_major(BPFProgram *prog, int type, int major, const char *acc) {
+        struct bpf_insn insn[] = {
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */
+                BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
+                BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
+                BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
+                BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */
+                BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
+        };
+        int r, access;
+
+        assert(prog);
+        assert(acc);
+
+        access = bpf_access_type(acc);
+        if (access <= 0)
+                return -EINVAL;
+
+        insn[2].imm = access;
+
+        r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
+        if (r < 0)
+                log_error_errno(r, "Extending device control BPF program failed: %m");
+
+        return r;
+}
+
+/* Allocates the device control BPF program and stores it in *ret. If policy is CGROUP_AUTO and there is
+ * no whitelist, no program is needed: 0 is returned and *ret is left untouched. */
+int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist) {
+        struct bpf_insn pre_insn[] = {
+                /* load device type to r2 */
+                BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+
+                /* load access type to r3 */
+                BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+                BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+                /* load major number to r4 */
+                BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+                /* load minor number to r5 */
+                BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+                            offsetof(struct bpf_cgroup_dev_ctx, minor)),
+        };
+
+        _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
+        int r;
+
+        assert(ret);
+
+        if (policy == CGROUP_AUTO && !whitelist)
+                return 0;
+
+        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &prog);
+        if (r < 0)
+                return log_error_errno(r, "Loading device control BPF program failed: %m");
+
+        if (policy == CGROUP_CLOSED || whitelist) {
+                r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn));
+                if (r < 0)
+                        return log_error_errno(r, "Extending device control BPF program failed: %m");
+        }
+
+        *ret = TAKE_PTR(prog);
+
+        return 0;
+}
+
+/* Completes prog with the default verdict and exit instructions, patches the PASS_JUMP_OFF jumps and
+ * attaches it to the cgroup of u. If prog is NULL, only the previously installed program is released. */
+int cgroup_apply_device_bpf(Unit *u, BPFProgram *prog, CGroupDevicePolicy policy, bool whitelist) {
+        struct bpf_insn post_insn[] = {
+                /* return DENY */
+                BPF_MOV64_IMM(BPF_REG_0, 0),
+                BPF_JMP_A(1),
+
+        };
+
+        struct bpf_insn exit_insn[] = {
+                /* else return ALLOW */
+                BPF_MOV64_IMM(BPF_REG_0, 1),
+                BPF_EXIT_INSN()
+        };
+
+        _cleanup_free_ char *path = NULL;
+        uint32_t flags;
+        int r;
+
+        if (!prog) {
+                /* Remove existing program. */
+                u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
+                return 0;
+        }
+
+        if (policy != CGROUP_STRICT || whitelist) {
+                size_t off;
+
+                r = bpf_program_add_instructions(prog, post_insn, ELEMENTSOF(post_insn));
+                if (r < 0)
+                        return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+                /* Fixup PASS_JUMP_OFF jump offsets. */
+                for (off = 0; off < prog->n_instructions; off++) {
+                        struct bpf_insn *ins = &prog->instructions[off];
+
+                        if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
+                                ins->off = prog->n_instructions - off - 1;
+                }
+        } else
+                /* Explicitly forbid everything. */
+                exit_insn[0].imm = 0;
+
+        r = bpf_program_add_instructions(prog, exit_insn, ELEMENTSOF(exit_insn));
+        if (r < 0)
+                return log_error_errno(r, "Extending device control BPF program failed: %m");
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine cgroup path: %m");
+
+        flags = (u->type == UNIT_SLICE || unit_cgroup_delegate(u)) ? BPF_F_ALLOW_MULTI : 0;
+
+        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */
+        u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
+
+        r = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE, path, flags);
+        if (r < 0)
+                return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", path);
+
+        /* Remember that this BPF program is installed now. */
+        u->bpf_device_control_installed = bpf_program_ref(prog);
+
+        return 0;
+}
+
+int bpf_devices_supported(void) {
+        struct bpf_insn trivial[] = {
+                BPF_MOV64_IMM(BPF_REG_0, 1),
+                BPF_EXIT_INSN()
+        };
+
+        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
+        static int supported = -1;
+        int r;
+
+        /* Checks whether BPF device controller is supported. For this, we check three things:
+         *
+         * a) whether we are privileged
+         * b) whether the unified hierarchy is being used
+         * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
+         */
+
+        if (supported >= 0)
+                return supported;
+
+        if (geteuid() != 0) {
+                log_debug("Not enough privileges, BPF device control is not supported.");
+                return supported = 0;
+        }
+
+        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
+        if (r < 0)
+                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
+        if (r == 0) {
+                log_debug("Not running with unified cgroups, BPF device control is not supported.");
+                return supported = 0;
+        }
+
+        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &program);
+        if (r < 0) {
+                log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
+        if (r < 0) {
+                log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        r = bpf_program_load_kernel(program, NULL, 0);
+        if (r < 0) {
+                log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
+                return supported = 0;
+        }
+
+        return supported = 1;
+}
diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h
new file mode 100644
index 0000000000000..f9a6eec028dd4
--- /dev/null
+++ b/src/core/bpf-devices.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include <inttypes.h>
+
+#include "unit.h"
+
+struct BPFProgram;
+
+int bpf_devices_supported(void);
+
+int cgroup_bpf_whitelist_device(BPFProgram *p, int type, int major, int minor, const char *acc);
+int cgroup_bpf_whitelist_major(BPFProgram *p, int type, int major, const char *acc);
+
+int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist);
+int cgroup_apply_device_bpf(Unit *u, BPFProgram *p, CGroupDevicePolicy policy, bool whitelist);
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index c390e696817a2..a34df2538fae1 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -7,6 +7,7 @@
 #include "blockdev-util.h"
 #include "bpf-firewall.h"
 #include "btrfs-util.h"
+#include "bpf-devices.h"
 #include "bus-error.h"
 #include "cgroup-util.h"
 #include "cgroup.h"
@@ -386,8 +387,7 @@ static int lookup_block_device(const char *p, dev_t *ret) {
         return 0;
 }

-static int whitelist_device(const char *path, const char *node, const char *acc) {
-        char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
+static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
         struct stat st;
         bool ignore_notfound;
         int r;
@@ -414,23 +414,34 @@ static int whitelist_device(const char *path, const char *node, const char *acc)
                 return -ENODEV;
         }

-        sprintf(buf,
-                "%c %u:%u %s",
-                S_ISCHR(st.st_mode) ? 'c' : 'b',
-                major(st.st_rdev), minor(st.st_rdev),
-                acc);
+        if (cg_all_unified() > 0) {
+                if (!prog)
+                        return 0;

-        r = cg_set_attribute("devices", path, "devices.allow", buf);
-        if (r < 0)
-                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
-                               "Failed to set devices.allow on %s: %m", path);
+                r = cgroup_bpf_whitelist_device(prog, S_ISCHR(st.st_mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
+                                                major(st.st_rdev), minor(st.st_rdev), acc);
+        } else {
+                char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
+
+                sprintf(buf,
+                        "%c %u:%u %s",
+                        S_ISCHR(st.st_mode) ? 'c' : 'b',
+                        major(st.st_rdev), minor(st.st_rdev),
+                        acc);
+
+                r = cg_set_attribute("devices", path, "devices.allow", buf);
+                if (r < 0)
+                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
+                                       r, "Failed to set devices.allow on %s: %m", path);
+        }

         return r;
 }

-static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
+static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
         _cleanup_fclose_ FILE *f = NULL;
         char line[LINE_MAX];
+        char *p, *w;
         bool good = false;
         int r;

@@ -443,7 +454,6 @@ static int whitelist_major(const char *path, const char *name, char type, const
                 return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

         FOREACH_LINE(line, f, goto fail) {
-                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                 unsigned maj;

                 truncate_nl(line);
@@ -485,16 +495,27 @@ static int whitelist_major(const char *path, const char *name, char type, const
                 if (fnmatch(name, w, 0) != 0)
                         continue;

-                sprintf(buf,
-                        "%c %u:* %s",
-                        type,
-                        maj,
-                        acc);
+                if (cg_all_unified() > 0) {
+                        if (!prog)
+                                continue;

-                r = cg_set_attribute("devices", path, "devices.allow", buf);
-                if (r < 0)
-                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
-                                       "Failed to set devices.allow on %s: %m", path);
+                        cgroup_bpf_whitelist_major(prog,
+                                                   type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
+                                                   maj, acc);
+                } else {
+                        char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
+
+                        sprintf(buf,
+                                "%c %u:* %s",
+                                type,
+                                maj,
+                                acc);
+
+                        r = cg_set_attribute("devices", path, "devices.allow", buf);
+                        if (r < 0)
+                                log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
+                                               r, "Failed to set devices.allow on %s: %m", path);
+                }
         }

         return 0;
@@ -1019,20 +1040,27 @@ static void cgroup_context_apply(
                 }
         }

-        if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
+        if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && !is_root) {
+                _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
                 CGroupDeviceAllow *a;

-                /* Changing the devices list of a populated cgroup
-                 * might result in EINVAL, hence ignore EINVAL
-                 * here. */
+                if (cg_all_unified() > 0) {
+                        r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
+                        if (r < 0)
+                                log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
+                } else {
+                        /* Changing the devices list of a populated cgroup
+                         * might result in EINVAL, hence ignore EINVAL
+                         * here. */

-                if (c->device_allow || c->device_policy != CGROUP_AUTO)
-                        r = cg_set_attribute("devices", path, "devices.deny", "a");
-                else
-                        r = cg_set_attribute("devices", path, "devices.allow", "a");
-                if (r < 0)
-                        log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
-                                      "Failed to reset devices.list: %m");
+                        if (c->device_allow || c->device_policy != CGROUP_AUTO)
+                                r = cg_set_attribute("devices", path, "devices.deny", "a");
+                        else
+                                r = cg_set_attribute("devices", path, "devices.allow", "a");
+                        if (r < 0)
+                                log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
+                                              "Failed to reset devices.list: %m");
+                }

                 if (c->device_policy == CGROUP_CLOSED ||
                     (c->device_policy == CGROUP_AUTO && c->device_allow)) {
@@ -1051,10 +1079,10 @@ static void cgroup_context_apply(
                         const char *x, *y;

                         NULSTR_FOREACH_PAIR(x, y, auto_devices)
-                                whitelist_device(path, x, y);
+                                whitelist_device(prog, path, x, y);

                         /* PTS (/dev/pts) devices may not be duplicated, but accessed */
-                        whitelist_major(path, "pts", 'c', "rw");
+                        whitelist_major(prog, path, "pts", 'c', "rw");
                 }

                 LIST_FOREACH(device_allow, a, c->device_allow) {
@@ -1074,14 +1102,26 @@ static void cgroup_context_apply(
                         acc[k++] = 0;

                         if (path_startswith(a->path, "/dev/"))
-                                whitelist_device(path, a->path, acc);
+                                whitelist_device(prog, path, a->path, acc);
                         else if ((val = startswith(a->path, "block-")))
-                                whitelist_major(path, val, 'b', acc);
+                                whitelist_major(prog, path, val, 'b', acc);
                         else if ((val = startswith(a->path, "char-")))
-                                whitelist_major(path, val, 'c', acc);
+                                whitelist_major(prog, path, val, 'c', acc);
                         else
                                 log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
                 }
+
+                r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
+                if (r < 0) {
+                        static bool warned = false;
+
+                        log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
+                                       "Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
+                                       "Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
+                                       "(This warning is only shown for the first loaded unit using device ACL.)", u->id);
+
+                        warned = true;
+                }
         }

         if (apply_mask & CGROUP_MASK_PIDS) {
@@ -1151,7 +1191,7 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {

         if (c->device_allow ||
             c->device_policy != CGROUP_AUTO)
-                mask |= CGROUP_MASK_DEVICES;
+                mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;

         if (c->tasks_accounting ||
             c->tasks_max != CGROUP_LIMIT_MAX)
@@ -1942,6 +1982,8 @@ void unit_prune_cgroup(Unit *u) {
         u->cgroup_realized = false;
         u->cgroup_realized_mask = 0;
         u->cgroup_enabled_mask = 0;
+
+        u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
 }

 int unit_search_main_pid(Unit *u, pid_t *ret) {
@@ -2212,6 +2254,11 @@ static int cg_bpf_mask_supported(CGroupMask *ret) {
         if (r > 0)
                 mask |= CGROUP_MASK_BPF_FIREWALL;

+        /* BPF-based device access control */
+        r = bpf_devices_supported();
+        if (r > 0)
+                mask |= CGROUP_MASK_BPF_DEVICES;
+
         *ret = mask;
         return 0;
 }
diff --git a/src/core/meson.build b/src/core/meson.build
index 3852c5e9d8977..450d6f72a9291 100644
--- a/src/core/meson.build
+++ b/src/core/meson.build
@@ -5,6 +5,8 @@ libcore_la_sources = '''
         audit-fd.h
         automount.c
         automount.h
+        bpf-devices.c
+        bpf-devices.h
         bpf-firewall.c
         bpf-firewall.h
         cgroup.c
diff --git a/src/core/unit.c b/src/core/unit.c
index 663df3e3c5988..783643fb618ad 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -666,6 +666,8 @@ void unit_free(Unit *u) {
         bpf_program_unref(u->ip_bpf_egress);
         bpf_program_unref(u->ip_bpf_egress_installed);

+        bpf_program_unref(u->bpf_device_control_installed);
+
         condition_free_list(u->conditions);
         condition_free_list(u->asserts);

diff --git a/src/core/unit.h b/src/core/unit.h
index 5a97d9f27c01d..dbd33d1c5b23d 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -257,6 +257,9 @@ typedef struct Unit {
         CGroupMask cgroup_members_mask;
         int cgroup_inotify_wd;

+        /* Device Controller BPF program */
+        BPFProgram *bpf_device_control_installed;
+
         /* IP BPF Firewalling/accounting */
         int ip_accounting_ingress_map_fd;
         int ip_accounting_egress_map_fd;
diff --git a/src/test/test-cgroup-mask.c b/src/test/test-cgroup-mask.c
index c4df3253653d2..d9223f5f61f25 100644
--- a/src/test/test-cgroup-mask.c
+++ b/src/test/test-cgroup-mask.c
@@ -100,7 +100,7 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) {

 static void test_cg_mask_to_string(void) {
         test_cg_mask_to_string_one(0, NULL);
-        test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall");
+        test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall bpf-devices");
         test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu");
         test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct");
         test_cg_mask_to_string_one(CGROUP_MASK_IO, "io");