forked from BachiLi/diffvg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
atomic.h
139 lines (126 loc) · 3.88 KB
/
atomic.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#pragma once
#include "diffvg.h"
#include "vector.h"
#include "matrix.h"
// https://stackoverflow.com/questions/39274472/error-function-atomicadddouble-double-has-already-been-defined
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
// Software double-precision atomicAdd, compiled only in the #else branch of
// the architecture check above (i.e. device passes with __CUDA_ARCH__ < 600).
// Reinterprets the double as a 64-bit integer and retries atomicCAS until the
// swap succeeds. Returns the value stored at `address` before the addition.
static inline DEVICE double atomicAdd(double *address, double val) {
    unsigned long long int *bits = (unsigned long long int *)address;
    unsigned long long int observed = *bits;
    // Adding zero leaves the stored value unchanged, so skip the CAS loop.
    if (val == 0.0) {
        return __longlong_as_double(observed);
    }
    unsigned long long int expected;
    do {
        expected = observed;
        const double sum = val + __longlong_as_double(expected);
        // atomicCAS returns the word it found; the swap only happened if that
        // word equals `expected`.
        observed = atomicCAS(bits, expected, __double_as_longlong(sum));
        // Compare as integers, not doubles, so a NaN payload cannot make the
        // loop spin forever.
    } while (expected != observed);
    return __longlong_as_double(observed);
}
#endif
#ifndef WIN32
// Atomically adds `source` to `target` and returns the value `target` held
// immediately before the addition.
//
// Device pass (__CUDA_ARCH__ defined): forwards to atomicAdd after converting
// `source` to T0.
// Host pass: compare-and-swap loop built on the GCC/Clang __atomic builtins.
template <typename T0, typename T1>
DEVICE
inline T0 atomic_add_(T0 &target, T1 source) {
#ifdef __CUDA_ARCH__
    return atomicAdd(&target, (T0)source);
#else
    // The __atomic builtins take plain int memory orders. __ATOMIC_SEQ_CST is
    // predefined by the compiler, so this path needs neither <atomic> nor the
    // std::memory_order enumerator spelling (which changed in C++20).
    T0 expected;
    __atomic_load(&target, &expected, __ATOMIC_SEQ_CST);
    T0 desired;
    do {
        desired = expected + source;
        // On failure the builtin writes the current contents of `target` into
        // `expected`, so the next iteration recomputes from a fresh value.
        // The weak form may fail spuriously; the loop absorbs that.
    } while (!__atomic_compare_exchange(&target, &expected, &desired,
                                        /*weak=*/true,
                                        __ATOMIC_SEQ_CST,
                                        __ATOMIC_SEQ_CST));
    return expected;
#endif
}
// Single-precision atomic add (non-Windows build).
// Forwards to atomic_add_ and hands back its result unchanged.
DEVICE
inline
float atomic_add(float &target, float source) {
    const float result = atomic_add_<float, float>(target, source);
    return result;
}
// Double-precision atomic add (non-Windows build).
// Forwards to atomic_add_ and hands back its result unchanged.
DEVICE
inline
double atomic_add(double &target, double source) {
    const double result = atomic_add_<double, double>(target, source);
    return result;
}
#else
// Host-side adds used by the Windows build of atomic_add below.
// Only declared here; the definitions are not visible in this header.
// NOTE(review): presumably these are atomic and return the previous value,
// like the non-Windows path — confirm against the definition.
float win_atomic_add(float &target, float source);
double win_atomic_add(double &target, double source);
// Single-precision atomic add (Windows build).
// Host pass: delegates to win_atomic_add.
// Device pass (__CUDA_ARCH__ defined): delegates to atomicAdd.
DEVICE
static float atomic_add(float &target, float source) {
#ifndef __CUDA_ARCH__
    return win_atomic_add(target, source);
#else
    return atomicAdd(&target, source);
#endif
}
// Double-precision atomic add (Windows build).
// Host pass: delegates to win_atomic_add.
// Device pass (__CUDA_ARCH__ defined): delegates to atomicAdd.
DEVICE
static double atomic_add(double &target, double source) {
#ifndef __CUDA_ARCH__
    return win_atomic_add(target, source);
#else
    // `source` is already a double, so no conversion is needed here.
    return atomicAdd(&target, source);
#endif
}
#endif
// Pointer convenience overload: converts `source` to T0, then forwards to the
// reference overload of atomic_add for the object `target` points at.
// `target` is dereferenced unconditionally, so it must not be null.
template <typename T0, typename T1>
DEVICE
inline T0 atomic_add(T0 *target, T1 source) {
    T0 &slot = *target;
    return atomic_add(slot, T0(source));
}
// Adds `source` to `target` one component at a time with the scalar
// atomic_add. Each component update is a separate call, so the two-component
// update is not a single atomic operation.
// Returns a copy of `target` taken after both additions were issued.
template <typename T0, typename T1>
DEVICE
inline TVector2<T0> atomic_add(TVector2<T0> &target, const TVector2<T1> &source) {
    for (int axis = 0; axis < 2; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}
// Adds the two components of `source` to target[0] and target[1], converting
// each component to T0 first. The caller must provide at least two writable
// T0 elements at `target`. Each element is updated by its own atomic_add call.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector2<T1> &source) {
    for (int axis = 0; axis < 2; ++axis) {
        atomic_add(target[axis], T0(source[axis]));
    }
}
// Adds `source` to `target` one component at a time with the scalar
// atomic_add. Each component update is a separate call, so the
// three-component update is not a single atomic operation.
// Returns a copy of `target` taken after all three additions were issued.
template <typename T0, typename T1>
DEVICE
inline TVector3<T0> atomic_add(TVector3<T0> &target, const TVector3<T1> &source) {
    for (int axis = 0; axis < 3; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}
// Adds the three components of `source` to target[0..2], converting each
// component to T0 first. The caller must provide at least three writable T0
// elements at `target`. Each element is updated by its own atomic_add call.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector3<T1> &source) {
    for (int axis = 0; axis < 3; ++axis) {
        atomic_add(target[axis], T0(source[axis]));
    }
}
// Adds `source` to `target` one component at a time with the scalar
// atomic_add. Each component update is a separate call, so the
// four-component update is not a single atomic operation.
// Returns a copy of `target` taken after all four additions were issued.
template <typename T0, typename T1>
DEVICE
inline TVector4<T0> atomic_add(TVector4<T0> &target, const TVector4<T1> &source) {
    for (int axis = 0; axis < 4; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}
// Adds the four components of `source` to target[0..3], converting each
// component to T0 first. The caller must provide at least four writable T0
// elements at `target`. Each element is updated by its own atomic_add call.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector4<T1> &source) {
    for (int axis = 0; axis < 4; ++axis) {
        atomic_add(target[axis], T0(source[axis]));
    }
}
// Adds every entry of the 3x3 matrix `source` to a flat array of nine T0
// values at `target`. Entry (row, col) goes to target[3 * row + col], and each
// entry is converted to T0 and added with its own atomic_add call.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TMatrix3x3<T1> &source) {
    for (int row = 0; row < 3; ++row) {
        // Start of this row inside the flat output array.
        T0 *row_out = target + 3 * row;
        for (int col = 0; col < 3; ++col) {
            atomic_add(row_out[col], T0(source(row, col)));
        }
    }
}