-
Notifications
You must be signed in to change notification settings - Fork 105
/
kernel.cu
133 lines (133 loc) · 4.3 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// Inner loop for prog_seed 600
//
// Runs one iteration of the mixing loop on the calling thread's `mix` array:
//   1. Issues a load of one dag_t entry from g_dag, at an index derived from
//      mix[0] of lane (loop % PROGPOW_LANES) and from this thread's lane id.
//   2. Executes a fixed interleaving of 12 c_dag lookups ("cache load 0..11")
//      and 20 arithmetic steps ("random math 0..19"). Each step computes a
//      value into `data` and merges it into one mix[] entry using one of:
//          (a ^ data) * 33,   (a * 33) + data,   ROTL32/ROTR32(a, k) ^ data
//   3. Merges the four words data_dag.s[0..3] of the entry loaded in step 1
//      into mix[0] (twice), mix[4] and mix[27].
//
// Parameters:
//   loop       - iteration index; selects the source lane for the DAG index
//                and is XORed into the per-lane entry selection.
//   mix        - array of PROGPOW_REGS words, updated in place.
//   g_dag      - table of dag_t entries; only read here.
//   c_dag      - table of PROGPOW_CACHE_WORDS words; only read here.
//   hack_false - gates the two __threadfence_block() calls below; presumably
//                always false at runtime (going by the name) - confirm against
//                the caller.
//
// The statement order is significant: later steps read mix[] entries written
// by earlier ones (e.g. mix[26] is read by cache load 0 and overwritten by
// random math 1), so statements must not be reordered.
//
// NOTE(review): __shfl_sync is called with mask 0xFFFFFFFF, so every thread of
// the warp is expected to reach this function together - confirm at call site.
// ROTL32, ROTR32, mul_hi and clz are defined outside this function.
__device__ __forceinline__ void progPowLoop(const uint32_t loop,
uint32_t mix[PROGPOW_REGS],
const dag_t *g_dag,
const uint32_t c_dag[PROGPOW_CACHE_WORDS],
const bool hack_false)
{
dag_t data_dag;
uint32_t offset, data;
// Position of this thread within its group of PROGPOW_LANES threads
// (the mask form assumes PROGPOW_LANES is a power of two - TODO confirm).
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
// global load
// All lanes of a PROGPOW_LANES-wide group take mix[0] from lane
// (loop % PROGPOW_LANES), so the whole group starts from the same value.
offset = __shfl_sync(0xFFFFFFFF, mix[0], loop%PROGPOW_LANES, PROGPOW_LANES);
// Reduce into [0, PROGPOW_DAG_ELEMENTS).
offset %= PROGPOW_DAG_ELEMENTS;
// Scale to a run of PROGPOW_LANES consecutive entries and pick one entry
// inside that run from (lane_id ^ loop).
offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
// The loaded entry is not consumed until the end of the function
// (presumably so the load overlaps with the work in between).
data_dag = g_dag[offset];
// hack to prevent compiler from reordering LD and usage
if (hack_false) __threadfence_block();
// cache load 0
offset = mix[26] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[0] = (mix[0] ^ data) * 33;
// random math 0
data = mix[10] ^ mix[16];
mix[4] = ROTL32(mix[4], 27) ^ data;
// cache load 1
offset = mix[30] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[27] = ROTR32(mix[27], 7) ^ data;
// random math 1
data = mix[24] & mix[14];
mix[26] = (mix[26] * 33) + data;
// cache load 2
offset = mix[1] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[13] = (mix[13] * 33) + data;
// random math 2
data = mix[17] & mix[16];
mix[15] = ROTR32(mix[15], 12) ^ data;
// cache load 3
offset = mix[19] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[17] = (mix[17] ^ data) * 33;
// random math 3
data = mul_hi(mix[31], mix[5]);
mix[7] = (mix[7] ^ data) * 33;
// cache load 4
offset = mix[11] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[14] = (mix[14] ^ data) * 33;
// random math 4
data = mix[23] * mix[19];
mix[8] = (mix[8] * 33) + data;
// cache load 5
offset = mix[21] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[9] = (mix[9] ^ data) * 33;
// random math 5
data = clz(mix[30]) + clz(mix[15]);
mix[12] = ROTR32(mix[12], 16) ^ data;
// cache load 6
offset = mix[15] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[3] = ROTR32(mix[3], 27) ^ data;
// random math 6
data = clz(mix[12]) + clz(mix[5]);
mix[10] = (mix[10] * 33) + data;
// cache load 7
offset = mix[18] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[1] = ROTR32(mix[1], 6) ^ data;
// random math 7
data = min(mix[4], mix[25]);
mix[11] = ROTR32(mix[11], 27) ^ data;
// cache load 8
offset = mix[3] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[6] = (mix[6] ^ data) * 33;
// random math 8
// mix[16] is both an input to mul_hi and the merge destination.
data = mul_hi(mix[18], mix[16]);
mix[16] = (mix[16] ^ data) * 33;
// cache load 9
offset = mix[17] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[28] = ROTL32(mix[28], 17) ^ data;
// random math 9
// NOTE(review): rotation count is a full mix word, not a constant; presumably
// ROTL32 reduces it modulo 32 - confirm in the macro/helper definition.
data = ROTL32(mix[15], mix[23]);
mix[31] = (mix[31] * 33) + data;
// cache load 10
// Uses the mix[31] value just written by random math 9.
offset = mix[31] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[2] = (mix[2] * 33) + data;
// random math 10
data = mix[11] | mix[17];
mix[19] = ROTL32(mix[19], 28) ^ data;
// cache load 11
offset = mix[16] % PROGPOW_CACHE_WORDS;
data = c_dag[offset];
mix[30] = ROTR32(mix[30], 18) ^ data;
// From here on there are no more c_dag lookups, only math steps.
// random math 11
data = mix[22] * mix[7];
mix[22] = ROTR32(mix[22], 30) ^ data;
// random math 12
data = mix[27] & mix[16];
mix[29] = ROTR32(mix[29], 25) ^ data;
// random math 13
// Variable rotation count (mix[0]); same caveat as random math 9.
data = ROTL32(mix[11], mix[0]);
mix[5] = (mix[5] ^ data) * 33;
// random math 14
// Variable rotation count (mix[25]); same caveat as random math 9.
data = ROTR32(mix[15], mix[25]);
mix[24] = ROTL32(mix[24], 13) ^ data;
// random math 15
data = mix[14] & mix[26];
mix[18] = (mix[18] * 33) + data;
// random math 16
data = mix[28] * mix[16];
mix[25] = (mix[25] ^ data) * 33;
// random math 17
data = mix[11] * mix[0];
mix[23] = (mix[23] ^ data) * 33;
// random math 18
data = mix[2] + mix[24];
mix[21] = ROTR32(mix[21], 20) ^ data;
// random math 19
data = mix[25] + mix[4];
mix[20] = ROTL32(mix[20], 22) ^ data;
// consume global load data
// hack to prevent compiler from reordering LD and usage
if (hack_false) __threadfence_block();
// Merge the four words of the entry loaded at the top of the function.
// mix[0] is updated twice in a row, so both s[0] and s[1] end up in mix[0].
mix[0] = (mix[0] ^ data_dag.s[0]) * 33;
mix[0] = ROTR32(mix[0], 21) ^ data_dag.s[1];
mix[4] = (mix[4] * 33) + data_dag.s[2];
mix[27] = (mix[27] ^ data_dag.s[3]) * 33;
}