Skip to content

Commit fbc921a

Browse files
invertego authored and LukeUsher committed
n64: replace cmath functions with arch-specific intrinsics
We cannot rely on specific status flags being set by C standard math functions.
1 parent 6bff023 commit fbc921a

File tree

4 files changed

+179
-67
lines changed

4 files changed

+179
-67
lines changed

ares/n64/cpu/algorithms.cpp

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
template <typename T>
2+
auto CPU::roundNearest(f32 f) -> T {
3+
#if defined(ARCHITECTURE_ARM64)
4+
u32 rnd = fenv.getRound();
5+
fenv.setRound(float_env::toNearest);
6+
T d = vrndns_f32(f);
7+
fenv.setRound(rnd);
8+
return d;
9+
#elif defined(ARCHITECTURE_AMD64)
10+
__m128 t = _mm_set_ss(f);
11+
t = _mm_round_ss(t, t, _MM_FROUND_TO_NEAREST_INT);
12+
return _mm_cvtss_f32(t);
13+
#else
14+
return lround(f);
15+
#endif
16+
}
17+
18+
template <typename T>
19+
auto CPU::roundNearest(f64 f) -> T {
20+
#if defined(ARCHITECTURE_ARM64)
21+
u32 rnd = fenv.getRound();
22+
fenv.setRound(float_env::toNearest);
23+
float64x1_t vf = {f};
24+
T d = vrndn_f64(vf)[0];
25+
fenv.setRound(rnd);
26+
return d;
27+
#elif defined(ARCHITECTURE_AMD64)
28+
__m128d t = _mm_set_sd(f);
29+
t = _mm_round_sd(t, t, _MM_FROUND_TO_NEAREST_INT);
30+
return _mm_cvtsd_f64(t);
31+
#else
32+
return llround(f);
33+
#endif
34+
}
35+
36+
template <typename T>
37+
auto CPU::roundCeil(f32 f) -> T {
38+
#if defined(ARCHITECTURE_AMD64)
39+
__m128 t = _mm_set_ss(f);
40+
t = _mm_round_ss(t, t, _MM_FROUND_TO_POS_INF);
41+
return _mm_cvtss_f32(t);
42+
#else
43+
return ceil(f);
44+
#endif
45+
}
46+
47+
template <typename T>
48+
auto CPU::roundCeil(f64 f) -> T {
49+
#if defined(ARCHITECTURE_AMD64)
50+
__m128d t = _mm_set_sd(f);
51+
t = _mm_round_sd(t, t, _MM_FROUND_TO_POS_INF);
52+
return _mm_cvtsd_f64(t);
53+
#else
54+
return ceil(f);
55+
#endif
56+
}
57+
58+
template<typename T>
59+
auto CPU::roundCurrent(f32 f) -> T {
60+
#if defined(ARCHITECTURE_AMD64)
61+
auto t = _mm_set_ss(f);
62+
t = _mm_round_ss(t, t, _MM_FROUND_CUR_DIRECTION);
63+
return _mm_cvtss_f32(t);
64+
#else
65+
return lrint(f);
66+
#endif
67+
}
68+
69+
template<typename T>
70+
auto CPU::roundCurrent(f64 f) -> T {
71+
#if defined(ARCHITECTURE_AMD64)
72+
auto t = _mm_set_sd(f);
73+
t = _mm_round_sd(t, t, _MM_FROUND_CUR_DIRECTION);
74+
return _mm_cvtsd_f64(t);
75+
#else
76+
return llrint(f);
77+
#endif
78+
}
79+
80+
template <typename T>
81+
auto CPU::roundFloor(f32 f) -> T {
82+
#if defined(ARCHITECTURE_AMD64)
83+
__m128 t = _mm_set_ss(f);
84+
t = _mm_round_ss(t, t, _MM_FROUND_TO_NEG_INF);
85+
return _mm_cvtss_f32(t);
86+
#else
87+
return floor(f);
88+
#endif
89+
}
90+
91+
template <typename T>
92+
auto CPU::roundFloor(f64 f) -> T {
93+
#if defined(ARCHITECTURE_AMD64)
94+
__m128d t = _mm_set_sd(f);
95+
t = _mm_round_sd(t, t, _MM_FROUND_TO_NEG_INF);
96+
return _mm_cvtsd_f64(t);
97+
#else
98+
return floor(f);
99+
#endif
100+
}
101+
102+
template <typename T>
103+
auto CPU::roundTrunc(f32 f) -> T {
104+
#if defined(ARCHITECTURE_AMD64)
105+
__m128 t = _mm_set_ss(f);
106+
t = _mm_round_ss(t, t, _MM_FROUND_TO_ZERO);
107+
return _mm_cvtss_f32(t);
108+
#else
109+
return trunc(f);
110+
#endif
111+
}
112+
113+
template <typename T>
114+
auto CPU::roundTrunc(f64 f) -> T {
115+
#if defined(ARCHITECTURE_AMD64)
116+
__m128d t = _mm_set_sd(f);
117+
t = _mm_round_sd(t, t, _MM_FROUND_TO_ZERO);
118+
return _mm_cvtsd_f64(t);
119+
#else
120+
return trunc(f);
121+
#endif
122+
}
123+
124+
//returns the single-precision square root of f.
//AMD64 builds use the scalar SSE square-root intrinsic; every other target uses sqrt().
auto CPU::squareRoot(f32 f) -> f32 {
#if !defined(ARCHITECTURE_AMD64)
  return sqrt(f);
#else
  const __m128 value = _mm_set_ss(f);
  return _mm_cvtss_f32(_mm_sqrt_ss(value));
#endif
}
133+
134+
//returns the double-precision square root of f.
//AMD64 builds use the scalar SSE2 square-root intrinsic; every other target uses sqrt().
auto CPU::squareRoot(f64 f) -> f64 {
#if !defined(ARCHITECTURE_AMD64)
  return sqrt(f);
#else
  const __m128d value = _mm_set_sd(f);
  return _mm_cvtsd_f64(_mm_sqrt_sd(value, value));
#endif
}

ares/n64/cpu/cpu.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ CPU cpu;
88
#include "tlb.cpp"
99
#include "memory.cpp"
1010
#include "exceptions.cpp"
11+
#include "algorithms.cpp"
1112
#include "interpreter.cpp"
1213
#include "interpreter-ipu.cpp"
1314
#include "interpreter-scc.cpp"

ares/n64/cpu/cpu.hpp

+14-2
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,20 @@ struct CPU : Thread {
344344
u64 pc; //program counter
345345
} ipu;
346346

347+
//algorithms.cpp
348+
template<typename T> auto roundNearest(f32 f) -> T;
349+
template<typename T> auto roundNearest(f64 f) -> T;
350+
template<typename T> auto roundCeil(f32 f) -> T;
351+
template<typename T> auto roundCeil(f64 f) -> T;
352+
template<typename T> auto roundCurrent(f32 f) -> T;
353+
template<typename T> auto roundCurrent(f64 f) -> T;
354+
template<typename T> auto roundFloor(f32 f) -> T;
355+
template<typename T> auto roundFloor(f64 f) -> T;
356+
template<typename T> auto roundTrunc(f32 f) -> T;
357+
template<typename T> auto roundTrunc(f64 f) -> T;
358+
auto squareRoot(f32 f) -> f32;
359+
auto squareRoot(f64 f) -> f64;
360+
347361
//interpreter-ipu.cpp
348362
auto ADD(r64& rd, cr64& rs, cr64& rt) -> void;
349363
auto ADDI(r64& rt, cr64& rs, s16 imm) -> void;
@@ -657,8 +671,6 @@ struct CPU : Thread {
657671
auto fpuClearCause() -> void;
658672
template<typename DST, typename SF>
659673
auto fpuCheckInputConv(SF& f) -> bool;
660-
template <typename T> auto roundeven(f32 f) -> T;
661-
template <typename T> auto roundeven(f64 f) -> T;
662674

663675
auto BC1(bool value, bool likely, s16 imm) -> void;
664676
auto CFC1(r64& rt, u8 rd) -> void;

0 commit comments

Comments
 (0)