riscv: smarter FPU context switching support
Instead of saving/restoring FPU content on every exception and task
switch, this replaces FPU sharing support with a "lazy" (on-demand)
context switching algorithm similar to the one used on ARM64.

Every thread starts with FPU access disabled. On the first access, the
FPU trap is invoked to:

- flush the FPU content to the previous owner's memory storage;

- restore the current thread's FPU content from memory.

When a thread loads its data into the FPU, it becomes the FPU owner.

FPU content is preserved across task switching; however, FPU access is
allowed only if the new thread is the FPU owner and denied otherwise.
A thread may claim FPU ownership only through the FPU trap. This way,
threads that don't use the FPU never force an FPU context switch.
If only one running thread uses the FPU, there will be no FPU context
switching to do at all.
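
The per-switch decision therefore reduces to a simple ownership test. The
following is a simplified sketch of the check that fpu_access_allowed() in
fpu.c below performs (the exception-level parameter is omitted for brevity):

/* Sketch only: simplified form of the ownership test used when resuming
 * a thread. FPU access is granted to the owner; any other thread traps
 * on its first FP instruction and goes through z_riscv_fpu_trap() to
 * perform the actual context switch.
 */
static bool fpu_access_allowed_sketch(void)
{
	return _current_cpu->arch.fpu_owner == _current;
}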

FP accesses are possible in ISRs and syscalls. This is not the norm
though, so the same principle is applied here, except that exception
contexts may not claim FPU ownership. When they access the FPU, the FPU
content is flushed and the exception context is granted FPU access for
the duration of the exception. Nested IRQs are disallowed in that case
to dispense with the need to save and restore the exception's FPU
context data.
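
In code terms, the exception-context path boils down to the following;
this is a sketch mirroring the corresponding hunk of z_riscv_fpu_trap()
in fpu.c below:

if (_current->arch.exception_depth > 0) {
	/* no nested IRQs: an interrupted exception's FPU context
	 * could not be preserved
	 */
	esf->mstatus &= ~MSTATUS_IEN;
	/* grant FPU access to the returning exception context only */
	esf->mstatus |= MSTATUS_FS_INIT;
	return;
}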

This is the core implementation only, to ease reviewing. It is not yet
hooked into the build.

Signed-off-by: Nicolas Pitre <[email protected]>
Nicolas Pitre authored and carlescufi committed Jan 24, 2023
1 parent 4f9b547 commit cb4c0f6
Showing 3 changed files with 384 additions and 0 deletions.
69 changes: 69 additions & 0 deletions arch/riscv/core/fpu.S
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2023 BayLibre SAS
* Written by: Nicolas Pitre
*
* SPDX-License-Identifier: Apache-2.0
*/

#include <zephyr/toolchain.h>
#include <zephyr/linker/sections.h>
#include <offsets.h>

#ifdef CONFIG_CPU_HAS_FPU_DOUBLE_PRECISION
#define LOAD fld
#define STORE fsd
#else
#define LOAD flw
#define STORE fsw
#endif

#define DO_FP_REGS(op, ptr) \
op fa0, __z_riscv_fp_context_t_fa0_OFFSET (ptr); \
op fa1, __z_riscv_fp_context_t_fa1_OFFSET (ptr); \
op fa2, __z_riscv_fp_context_t_fa2_OFFSET (ptr); \
op fa3, __z_riscv_fp_context_t_fa3_OFFSET (ptr); \
op fa4, __z_riscv_fp_context_t_fa4_OFFSET (ptr); \
op fa5, __z_riscv_fp_context_t_fa5_OFFSET (ptr); \
op fa6, __z_riscv_fp_context_t_fa6_OFFSET (ptr); \
op fa7, __z_riscv_fp_context_t_fa7_OFFSET (ptr); \
op fs0, __z_riscv_fp_context_t_fs0_OFFSET (ptr); \
op fs1, __z_riscv_fp_context_t_fs1_OFFSET (ptr); \
op fs2, __z_riscv_fp_context_t_fs2_OFFSET (ptr); \
op fs3, __z_riscv_fp_context_t_fs3_OFFSET (ptr); \
op fs4, __z_riscv_fp_context_t_fs4_OFFSET (ptr); \
op fs5, __z_riscv_fp_context_t_fs5_OFFSET (ptr); \
op fs6, __z_riscv_fp_context_t_fs6_OFFSET (ptr); \
op fs7, __z_riscv_fp_context_t_fs7_OFFSET (ptr); \
op fs8, __z_riscv_fp_context_t_fs8_OFFSET (ptr); \
op fs9, __z_riscv_fp_context_t_fs9_OFFSET (ptr); \
op fs10, __z_riscv_fp_context_t_fs10_OFFSET(ptr); \
op fs11, __z_riscv_fp_context_t_fs11_OFFSET(ptr); \
op ft0, __z_riscv_fp_context_t_ft0_OFFSET (ptr); \
op ft1, __z_riscv_fp_context_t_ft1_OFFSET (ptr); \
op ft2, __z_riscv_fp_context_t_ft2_OFFSET (ptr); \
op ft3, __z_riscv_fp_context_t_ft3_OFFSET (ptr); \
op ft4, __z_riscv_fp_context_t_ft4_OFFSET (ptr); \
op ft5, __z_riscv_fp_context_t_ft5_OFFSET (ptr); \
op ft6, __z_riscv_fp_context_t_ft6_OFFSET (ptr); \
op ft7, __z_riscv_fp_context_t_ft7_OFFSET (ptr); \
op ft8, __z_riscv_fp_context_t_ft8_OFFSET (ptr); \
op ft9, __z_riscv_fp_context_t_ft9_OFFSET (ptr); \
op ft10, __z_riscv_fp_context_t_ft10_OFFSET(ptr); \
op ft11, __z_riscv_fp_context_t_ft11_OFFSET(ptr)

GTEXT(z_riscv_fpu_save)
SECTION_FUNC(TEXT, z_riscv_fpu_save)

frcsr t0
DO_FP_REGS(STORE, a0)
sw t0, __z_riscv_fp_context_t_fcsr_OFFSET(a0)
ret

GTEXT(z_riscv_fpu_restore)
SECTION_FUNC(TEXT, z_riscv_fpu_restore)

DO_FP_REGS(LOAD, a0)
lw t0, __z_riscv_fp_context_t_fcsr_OFFSET(a0)
fscsr t0
ret

310 changes: 310 additions & 0 deletions arch/riscv/core/fpu.c
@@ -0,0 +1,310 @@
/*
* Copyright (c) 2023 BayLibre SAS
* Written by: Nicolas Pitre
*
* SPDX-License-Identifier: Apache-2.0
*/

#include <zephyr/kernel.h>
#include <zephyr/kernel_structs.h>
#include <kernel_arch_interface.h>
#include <zephyr/sys/atomic.h>

/* to be found in fpu.S */
extern void z_riscv_fpu_save(struct z_riscv_fp_context *saved_fp_context);
extern void z_riscv_fpu_restore(struct z_riscv_fp_context *saved_fp_context);

#define FPU_DEBUG 0

#if FPU_DEBUG

/*
* Debug traces have to be produced without printk() or any other functions
* using a va_list as va_start() may copy the FPU registers that could be
* used to pass float arguments, and that would trigger an FPU access trap.
* Note: Apparently gcc doesn't use float regs with variadic functions on
* RISC-V even if -mabi is used with f or d, so this precaution might be
* unnecessary. But better safe than sorry, especially for debugging code.
*/

#include <string.h>

static void DBG(char *msg, struct k_thread *th)
{
char buf[80], *p;
unsigned int v;

strcpy(buf, "CPU# exc# ");
buf[3] = '0' + _current_cpu->id;
buf[8] = '0' + _current->arch.exception_depth;
strcat(buf, _current->name);
strcat(buf, ": ");
strcat(buf, msg);
strcat(buf, " ");
strcat(buf, th->name);

v = *(unsigned char *)&th->arch.saved_fp_context;
p = buf + strlen(buf);
*p++ = ' ';
*p++ = ((v >> 4) < 10) ? ((v >> 4) + '0') : ((v >> 4) - 10 + 'a');
*p++ = ((v & 15) < 10) ? ((v & 15) + '0') : ((v & 15) - 10 + 'a');
*p++ = '\n';
*p = 0;

k_str_out(buf, p - buf);
}

#else

static inline void DBG(char *msg, struct k_thread *t) { }

#endif /* FPU_DEBUG */

static void z_riscv_fpu_disable(void)
{
unsigned long status = csr_read(mstatus);

__ASSERT((status & MSTATUS_IEN) == 0, "must be called with IRQs disabled");

if ((status & MSTATUS_FS) != 0) {
csr_clear(mstatus, MSTATUS_FS);

/* remember its clean/dirty state */
_current_cpu->arch.fpu_state = (status & MSTATUS_FS);
}
}

/*
* Flush FPU content and clear ownership. If the saved FPU state is "clean"
* then we know the in-memory copy is up to date and skip the FPU content
* transfer. The saved FPU state is updated upon disabling FPU access so
* we require that this be called only when the FPU is disabled.
*
* This is called locally and also from flush_fpu_ipi_handler().
*/
void z_riscv_flush_local_fpu(void)
{
__ASSERT((csr_read(mstatus) & MSTATUS_IEN) == 0,
"must be called with IRQs disabled");
__ASSERT((csr_read(mstatus) & MSTATUS_FS) == 0,
"must be called with FPU access disabled");

struct k_thread *owner = atomic_ptr_get(&_current_cpu->arch.fpu_owner);

if (owner != NULL) {
bool dirty = (_current_cpu->arch.fpu_state == MSTATUS_FS_DIRTY);

if (dirty) {
/* turn on FPU access */
csr_set(mstatus, MSTATUS_FS_CLEAN);
/* save current owner's content */
z_riscv_fpu_save(&owner->arch.saved_fp_context);
}

/* disable FPU access */
csr_clear(mstatus, MSTATUS_FS);

/* release ownership */
atomic_ptr_clear(&_current_cpu->arch.fpu_owner);
DBG("disable", owner);
}
}

#ifdef CONFIG_SMP
static void flush_owned_fpu(struct k_thread *thread)
{
__ASSERT((csr_read(mstatus) & MSTATUS_IEN) == 0,
"must be called with IRQs disabled");

int i;
atomic_ptr_val_t owner;

/* search all CPUs for the owner we want */
unsigned int num_cpus = arch_num_cpus();

for (i = 0; i < num_cpus; i++) {
owner = atomic_ptr_get(&_kernel.cpus[i].arch.fpu_owner);
if (owner != thread) {
continue;
}
/* we found it live on CPU i */
if (i == _current_cpu->id) {
z_riscv_fpu_disable();
z_riscv_flush_local_fpu();
break;
}
/* the FPU context is live on another CPU */
z_riscv_flush_fpu_ipi(i);

/*
* Wait for it only if this is about the thread
* currently running on this CPU. Otherwise the
* other CPU running some other thread could regain
* ownership the moment it is relinquished and we
* would be stuck here.
*
* Also, if this is for the thread running on this
* CPU, then we preemptively flush any live context
* on this CPU as well since we're likely to
* replace it, and this avoids a deadlock where
* two CPUs want to pull each other's FPU context.
*/
if (thread == _current) {
z_riscv_fpu_disable();
z_riscv_flush_local_fpu();
do {
arch_nop();
owner = atomic_ptr_get(&_kernel.cpus[i].arch.fpu_owner);
} while (owner == thread);
}
break;
}
}
#endif

void z_riscv_fpu_enter_exc(void)
{
/* always deny FPU access whenever an exception is entered */
z_riscv_fpu_disable();
}

/*
* Process the FPU trap.
*
* This usually means that FP regs belong to another thread. Save them
* to that thread's save area and restore the current thread's content.
*
* We also get here when FP regs are used while in exception as FP access
* is always disabled by default in that case. If so, we save the FPU
* content to the owning thread and simply enable FPU access. Exceptions
* should be short and have no persistent register context when they're
* done, so there is nothing to save/restore for that context... as long
* as we don't get interrupted, that is. To ensure that, we mask
* interrupts in the triggering exception context.
*
* Note that the exception depth count was not incremented before this call
* as no further exceptions are expected before returning to normal mode.
*/
void z_riscv_fpu_trap(z_arch_esf_t *esf)
{
__ASSERT((esf->mstatus & MSTATUS_FS) == 0 &&
(csr_read(mstatus) & MSTATUS_FS) == 0,
"called despite FPU being accessible");

/* save current owner's content if any */
z_riscv_flush_local_fpu();

if (_current->arch.exception_depth > 0) {
/*
* We were already in exception when the FPU access trapped.
* We give it access and prevent any further IRQ recursion
* by disabling IRQs as we wouldn't be able to preserve the
* interrupted exception's FPU context.
*/
esf->mstatus &= ~MSTATUS_IEN;

/* make it accessible to the returning context */
esf->mstatus |= MSTATUS_FS_INIT;

return;
}

#ifdef CONFIG_SMP
/*
* Make sure the FPU context we need isn't live on another CPU.
* The current CPU's FPU context is NULL at this point.
*/
flush_owned_fpu(_current);
#endif

/* become new owner */
atomic_ptr_set(&_current_cpu->arch.fpu_owner, _current);

/* make it accessible and clean to the returning context */
esf->mstatus |= MSTATUS_FS_CLEAN;

/* restore our content */
csr_set(mstatus, MSTATUS_FS_INIT);
z_riscv_fpu_restore(&_current->arch.saved_fp_context);
DBG("restore", _current);
}

/*
* Perform lazy FPU context switching by simply granting or denying
* access to FP regs based on FPU ownership before leaving the last
* exception level in case of exceptions, or during a thread context
* switch with the exception level of the new thread being 0.
* If the current thread doesn't own the FP regs then it will trap on its
* first access, and only then will the actual FPU context switch occur.
*/
static bool fpu_access_allowed(unsigned int exc_update_level)
{
__ASSERT((csr_read(mstatus) & MSTATUS_IEN) == 0,
"must be called with IRQs disabled");

if (_current->arch.exception_depth == exc_update_level) {
/* We're about to execute non-exception code */
return (_current_cpu->arch.fpu_owner == _current);
}
/*
* Any new exception level should always trap on FPU
* access as we want to make sure IRQs are disabled before
* granting it access (see z_riscv_fpu_trap() documentation).
*/
return false;
}

/*
* This is called on every exception exit except for z_riscv_fpu_trap().
* In that case the exception level of interest is 1 (soon to be 0).
*/
void z_riscv_fpu_exit_exc(z_arch_esf_t *esf)
{
if (fpu_access_allowed(1)) {
esf->mstatus |= _current_cpu->arch.fpu_state;
} else {
esf->mstatus &= ~MSTATUS_FS;
}
}

/*
* This is called from z_riscv_context_switch(). FPU access may be granted
* only if exception level is 0. If we switch to a thread that is still in
* some exception context then FPU access would be re-evaluated at exception
* exit time via z_riscv_fpu_exit_exc().
*/
void z_riscv_fpu_thread_context_switch(void)
{
if (fpu_access_allowed(0)) {
csr_clear(mstatus, MSTATUS_FS);
csr_set(mstatus, _current_cpu->arch.fpu_state);
} else {
z_riscv_fpu_disable();
}
}

int arch_float_disable(struct k_thread *thread)
{
if (thread != NULL) {
unsigned int key = arch_irq_lock();

#ifdef CONFIG_SMP
flush_owned_fpu(thread);
#else
if (thread == _current_cpu->arch.fpu_owner) {
z_riscv_fpu_disable();
z_riscv_flush_local_fpu();
}
#endif

arch_irq_unlock(key);
}

return 0;
}

int arch_float_enable(struct k_thread *thread, unsigned int options)
{
/* floats always get enabled automatically at the moment */
return 0;
}
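
For reference, arch_float_disable() is the hook behind Zephyr's public
k_float_disable() API when CONFIG_FPU_SHARING is enabled. A hypothetical
usage sketch from application code (the function name and error handling
here are illustrative only):

#include <zephyr/kernel.h>

/* Relinquish a thread's FPU ownership so it no longer pins a live
 * FPU context on any CPU (sketch; error handling left to the caller).
 */
static void drop_fpu_ownership(struct k_thread *thread)
{
	int ret = k_float_disable(thread);

	if (ret != 0) {
		/* unsupported or rejected in this configuration */
	}
}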
5 changes: 5 additions & 0 deletions arch/riscv/include/kernel_arch_func.h
@@ -80,6 +80,11 @@ extern FUNC_NORETURN void z_riscv_userspace_enter(k_thread_entry_t user_entry,
int z_irq_do_offload(void);
#endif

#ifdef CONFIG_FPU_SHARING
void z_riscv_flush_local_fpu(void);
void z_riscv_flush_fpu_ipi(unsigned int cpu);
#endif

#endif /* _ASMLANGUAGE */

#ifdef __cplusplus
