Skip to content

Commit

Permalink
checkhash: simplify the common function
Browse files Browse the repository at this point in the history
use klaus trivial function, the old code has always been a bit weird..

split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff
  • Loading branch information
tpruvot committed Nov 30, 2014
1 parent cc9df2d commit 118a6be
Show file tree
Hide file tree
Showing 15 changed files with 114 additions and 87 deletions.
5 changes: 3 additions & 2 deletions JHA/jackpotcoin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
uint32_t *d_nonces2, size_t *nrm2,
int order);

extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_jackpotNonces[8];
static uint32_t *d_branch1Nonces[8];
Expand Down Expand Up @@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
}

// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
unsigned int rounds;
Expand Down
150 changes: 92 additions & 58 deletions cuda_checkhash.cu
Original file line number Diff line number Diff line change
@@ -1,45 +1,16 @@
/**
* This code compares final hash against target
*/
#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

// Hash Target gegen das wir testen sollen
__constant__ uint32_t pTarget[8];

static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];

__global__
void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
uint32_t hash[8];

#pragma unroll 8
for (int i=0; i < 8; i++)
hash[i] = inpHash[i];

for (int i = 7; i >= 0; i--) {
if (hash[i] > pTarget[i]) {
return;
}
if (hash[i] <= pTarget[i]) {
break;
}
}
if (resNounce[0] > nounce)
resNounce[0] = nounce;
}
}

// Setup-Funktionen
__host__
void cuda_check_cpu_init(int thr_id, int threads)
{
Expand All @@ -54,71 +25,134 @@ void cuda_check_cpu_setTarget(const void *ptarget)
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
}

/* --------------------------------------------------------------------------------------------- */

__device__ __forceinline__
static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
{
if (hash[7] > target[7])
return false;
if (hash[7] < target[7])
return true;
if (hash[6] > target[6])
return false;
if (hash[6] < target[6])
return true;

if (hash[5] > target[5])
return false;
if (hash[5] < target[5])
return true;
if (hash[4] > target[4])
return false;
if (hash[4] < target[4])
return true;

if (hash[3] > target[3])
return false;
if (hash[3] < target[3])
return true;
if (hash[2] > target[2])
return false;
if (hash[2] < target[2])
return true;

if (hash[1] > target[1])
return false;
if (hash[1] < target[1])
return true;
if (hash[0] > target[0])
return false;

return true;
}

__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// shl 4 = *16 x 4 (uint32) = 64 bytes
uint32_t *inpHash = &hash[thread << 4];

if (hashbelowtarget(inpHash, pTarget)) {
uint32_t nounce = (startNounce + thread);
resNounce[0] = nounce;
}
}
}

__host__
uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

const int threadsperblock = 256;
const int threadsperblock = 512;

dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);

cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);

// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
cudaThreadSynchronize();

// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

// cudaMemcpy() ist asynchron!
cudaThreadSynchronize();
result = *h_resNounce[thr_id];

return result;
}

/* --------------------------------------------------------------------------------------------- */

__global__
void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
/* only test the last 2 dwords, ok for most algos */
int hashPos = thread << 4;
uint32_t *inpHash = &hashEnd[hashPos];
uint32_t nounce = g_nonceVector[thread];
uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
//uint32_t hash[8];

if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
uint32_t nounce = (startNounce + thread);
if (resNounce[0] > nounce)
resNounce[0] = nounce;
//#pragma unroll 8
//for (int i=0; i < 8; i++)
// hash[i] = inpHash[i];

for (int i = 7; i >= 0; i--) {
if (inpHash[i] > pTarget[i]) {
return;
}
if (inpHash[i] < pTarget[i]) {
break;
}
}
if (resNounce[0] > nounce)
resNounce[0] = nounce;
}
}

__host__
uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

const int threadsperblock = 256;

dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);

cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);

// MyStreamSynchronize(NULL, order, thr_id);
cudaThreadSynchronize();
MyStreamSynchronize(NULL, order, thr_id);

cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

// cudaMemcpy() was asynchron ?
// cudaThreadSynchronize();
cudaThreadSynchronize();
result = *h_resNounce[thr_id];

return result;
}
}
3 changes: 1 addition & 2 deletions cuda_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ extern "C" long device_sm[8];
// common functions
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);

Expand Down
4 changes: 1 addition & 3 deletions cuda_nist5.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ extern "C"

#include "cuda_helper.h"

// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];

extern void quark_blake512_cpu_init(int thr_id, int threads);
Expand Down Expand Up @@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
6 changes: 4 additions & 2 deletions quark/animecoin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
uint32_t *d_nonces1, size_t *nrm1,
int order);

// Original Quarkhash Funktion aus einem miner Quelltext
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

/* CPU Hash */
extern "C" void animehash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
Expand Down Expand Up @@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
4 changes: 3 additions & 1 deletion quark/quarkcoin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
uint32_t *d_nonces1, size_t *nrm1,
int order);

extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

// Original Quarkhash Funktion aus einem miner Quelltext
extern "C" void quarkhash(void *state, const void *input)
{
Expand Down Expand Up @@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
const uint32_t Htarg = ptarget[7];
Expand Down
3 changes: 2 additions & 1 deletion qubit/deep.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
2 changes: 1 addition & 1 deletion qubit/qubit.cu
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
2 changes: 1 addition & 1 deletion x11/fresh.cu
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
print_hash((unsigned char*)buf); printf("\n");
#endif

foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
2 changes: 1 addition & 1 deletion x11/s3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

if (foundNonce != 0xffffffff)
{
Expand Down
11 changes: 1 addition & 10 deletions x11/x11.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

// to check... new sp method
//extern void x11_echo512_cpu_setTarget(const void *ptarget);
//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

// X11 CPU Hash
extern "C" void x11hash(void *output, const void *input)
{
Expand Down Expand Up @@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,

quark_blake512_cpu_setBlock_80((void*)endiandata);

//x11_echo512_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);

do {
Expand All @@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

// todo...
//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
Expand Down
3 changes: 1 addition & 2 deletions x13/x13.cu
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

// Scan nach Gewinner Hashes auf der GPU
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
const uint32_t Htarg = ptarget[7];
Expand Down
2 changes: 1 addition & 1 deletion x15/x14.cu
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
/* check now with the CPU to confirm */
Expand Down
2 changes: 1 addition & 1 deletion x15/x15.cu
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

/* Scan with GPU */
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

if (foundNonce != 0xffffffff)
{
Expand Down
Loading

0 comments on commit 118a6be

Please sign in to comment.