Skip to content

Commit

Permalink
revert kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
nginnever committed Oct 26, 2016
1 parent 3a0cbb6 commit 2cc8991
Showing 1 changed file with 55 additions and 67 deletions.
122 changes: 55 additions & 67 deletions src/libzogminer/kernels/cl_zogminer_kernel.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


/* START OF BLAKE2B CODE */


Expand All @@ -14,7 +12,6 @@ typedef ulong uint64_t;
#ifndef BLAKE2_LOCAL_INLINE
#define BLAKE2_LOCAL_INLINE(type) static inline type
#endif

enum blake2b_constant
{
BLAKE2B_BLOCKBYTES = 128,
Expand Down Expand Up @@ -520,14 +517,9 @@ void blake2b(uint8_t *out,


// TODO REMOVE THIS LINE FOR GPU
<<<<<<< HEAD
//#pragma OPENCL EXTENSION cl_intel_printf : enable
=======
#ifdef cl_intel_printf
#pragma OPENCL EXTENSION cl_intel_printf : enable
#endif
>>>>>>> 56d0e001322b37cd4ef1f6df568bed0e5a12f64a


#define EQUIHASH_N 200
#define EQUIHASH_K 9
Expand Down Expand Up @@ -568,7 +560,7 @@ void set_element_parent_bucket_data(__global element_t* dst, uint32_t parent_buc
//dst->b_parent_bucket_sub_index = b;
}

void get_element_parent_bucket_data(__global element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b) {
void get_element_parent_bucket_data(element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b) {
*parent_bucket_index = src->parent_bucket_data >> 8;
*a = (src->parent_bucket_data >> 4) & 0xf;
*b = (src->parent_bucket_data & 0xf);
Expand All @@ -589,21 +581,14 @@ uint32_t mask_collision_bits_step0(uint8_t* data, size_t bit_index) {
return n;
}

void memcpy_step0t1(__global digest_t *dest, void *src, size_t n) {
void memcpy_step0(__global void *dest, void *src, size_t n) {
char *csrc = (char *)src;
__global char *cdest = (__global char *)dest;

for (int i=0; i<n; i++)
cdest[i] = csrc[i];
}

void memcpy_step0t2(__global uint32_t *dest, void *src, size_t n) {
char *csrc = (char *)src;
__global char *cdest = (__global char *)dest;

for (int i=0; i<n; i++)
cdest[i] = csrc[i];
}


void xor_elements(__global uint8_t* dst, __global uint8_t* a, __global uint8_t* b) {
Expand All @@ -617,22 +602,10 @@ void xor_elements(__global uint8_t* dst, __global uint8_t* a, __global uint8_t*
__kernel void initial_bucket_hashing(__global bucket_t* dst_buckets, __global digest_t* dst_digests, __constant const blake2b_state* digest, __global volatile uint32_t* new_digest_index) {
uint8_t new_digest[2*DIGEST_SIZE];
memset(new_digest, '\0', 2*DIGEST_SIZE);
size_t start = get_global_id(0);
size_t start = get_global_id(0) * ((NUM_VALUES / 2) / get_global_size(0));
size_t end = (get_global_id(0)+1) * ((NUM_VALUES / 2) / get_global_size(0));
uint64_t tmp = *new_digest_index;

<<<<<<< HEAD
blake2b_state current_digest = *digest;
blake2b_update(&current_digest, (uint8_t*)&start, sizeof(uint32_t));
blake2b_final(&current_digest, (uint8_t*)(new_digest), 2*DIGEST_SIZE);

for(uint32_t j = 0; j < 2; ++j) {
uint32_t new_index = mask_collision_bits_step0(new_digest + (j*EQUIHASH_N/8), 0);
__global element_t* new_el = dst_buckets[new_index].data + atomic_add(&dst_buckets[new_index].size, 1);
new_el->digest_index = atomic_add(new_digest_index, 1);

set_element_parent_bucket_data(new_el, start*2 + j, 0, 0);
memcpy_step0(dst_digests + new_el->digest_index, new_digest + (j*EQUIHASH_N/8), DIGEST_SIZE);
=======
for(uint32_t i = start; i < end; ++i) {
blake2b_state current_digest = *digest;
blake2b_update(&current_digest, (uint8_t*)&i, sizeof(uint32_t));
Expand All @@ -641,66 +614,74 @@ __kernel void initial_bucket_hashing(__global bucket_t* dst_buckets, __global di
for(uint32_t j = 0; j < 2; ++j) {
uint32_t new_index = mask_collision_bits_step0(new_digest + (j*EQUIHASH_N/8), 0);
__global element_t* new_el = dst_buckets[new_index].data + atomic_add(&dst_buckets[new_index].size, 1);
new_el->digest_index = atomic_add(new_digest_index, 1);
//new_el->digest_index = atomic_add(new_digest_index, 1);
new_el->digest_index = (get_global_id(0) * 2) + j;

set_element_parent_bucket_data(new_el, i*2 + j, 0, 0);
memcpy_step0t1(dst_digests + new_el->digest_index, new_digest + (j*EQUIHASH_N/8), DIGEST_SIZE);
memcpy_step0((__global void*)(dst_digests + new_el->digest_index), new_digest + (j*EQUIHASH_N/8), DIGEST_SIZE);
}
>>>>>>> 56d0e001322b37cd4ef1f6df568bed0e5a12f64a
}
}

__kernel void bucket_collide_and_hash(__global digest_t* dst_digests, __global digest_t* src_digests, __global bucket_t* buckets, uint32_t step_index, __global volatile uint32_t* new_digest_index) {
size_t start_bit = (step_index*NUM_COLLISION_BITS);
size_t byte_index = start_bit / 8;
size_t bit_index = start_bit % 8;
size_t start = get_global_id(0) * (NUM_BUCKETS / get_global_size(0));
size_t end = (get_global_id(0)+1) * (NUM_BUCKETS / get_global_size(0));

__global bucket_t* src_buckets = buckets + (step_index-1)*NUM_BUCKETS;
__global bucket_t* dst_buckets = buckets + step_index*NUM_BUCKETS;
__global bucket_t* bucket = src_buckets+get_global_id(0);
//bucket->size = bucket->size < 13 ? bucket->size : 13;

for(size_t a = 0; a < bucket->size; ++a) {
element_t base = bucket->data[a];

__global uint8_t* base_digest = (__global uint8_t*)src_digests[base.digest_index];
uint32_t base_collision_bits = mask_collision_bits(base_digest + byte_index, bit_index);
for(size_t b = a+1; b < bucket->size; ++b) {
element_t el = bucket->data[b];
__global uint8_t* el_digest = (__global uint8_t*)src_digests[el.digest_index];
uint32_t new_index = base_collision_bits ^ mask_collision_bits(el_digest + byte_index, bit_index);
if(new_index == 0) continue;

__global element_t* new_el = dst_buckets[new_index].data + atomic_add(&dst_buckets[new_index].size, 1);
set_element_parent_bucket_data(new_el, get_global_id(0), a, b);
new_el->digest_index = atomic_add(new_digest_index, 1);

xor_elements((__global uint8_t*)(dst_digests + new_el->digest_index), base_digest, el_digest);
}
for(uint32_t current_bucket_index = start; current_bucket_index < end; ++current_bucket_index) {
__global bucket_t* bucket = src_buckets+current_bucket_index;
//bucket->size = bucket->size < 13 ? bucket->size : 13;

for(size_t a = 0; a < bucket->size; ++a) {
element_t base = bucket->data[a];

__global uint8_t* base_digest = (__global uint8_t*)src_digests[base.digest_index];
uint32_t base_collision_bits = mask_collision_bits(base_digest + byte_index, bit_index);
for(size_t b = a+1; b < bucket->size; ++b) {
element_t el = bucket->data[b];
__global uint8_t* el_digest = (__global uint8_t*)src_digests[el.digest_index];
uint32_t new_index = base_collision_bits ^ mask_collision_bits(el_digest + byte_index, bit_index);
if(new_index == 0) continue;

__global element_t* new_el = dst_buckets[new_index].data + atomic_add(&dst_buckets[new_index].size, 1);
set_element_parent_bucket_data(new_el, current_bucket_index, a, b);
new_el->digest_index = atomic_add(new_digest_index, 1);
//for(uint32_t h = 0; h < 2; h++){
// new_el->digest_index = (get_global_id(0) * 2) + h;
//}

xor_elements((__global uint8_t*)(dst_digests + new_el->digest_index), base_digest, el_digest);
}
}
bucket->size = 0;
}
bucket->size = 0;
}


void decompress_indices(uint32_t* dst_uncompressed_indices, __global bucket_t* buckets, __global element_t* old_src, __global element_t* elements) {
elements[0] = *old_src;
void decompress_indices(uint32_t* dst_uncompressed_indices, __global bucket_t* buckets, __global element_t* old_src) {
element_t elements[EQUIHASH_K][NUM_INDICES];
elements[0][0] = *old_src;

for(size_t i = 0; i < EQUIHASH_K-1; ++i) {
for(size_t j = 0; j < (1 << i); ++j) {
__global element_t* src = elements + i*NUM_INDICES + j;
element_t* src = elements[i] + j;
uint32_t parent_bucket_index;
uint8_t a;
uint8_t b;
get_element_parent_bucket_data(src, &parent_bucket_index, &a, &b);

__global bucket_t* parent_bucket = buckets + ((EQUIHASH_K-2-i) * NUM_BUCKETS) + parent_bucket_index;
elements[(i+1)*NUM_INDICES + 2*j] = parent_bucket->data[a];
elements[(i+1)*NUM_INDICES + 2*j+1] = parent_bucket->data[b];
elements[i+1][2*j] = parent_bucket->data[a];
elements[i+1][2*j+1] = parent_bucket->data[b];
}
}

for(size_t j = 0; j < NUM_INDICES/2; ++j) {
__global element_t* src = elements + (EQUIHASH_K-1)*NUM_INDICES + j;
element_t* src = elements[EQUIHASH_K-1] + j;
uint32_t parent_bucket_index;
uint8_t a;
uint8_t b;
Expand All @@ -710,12 +691,12 @@ void decompress_indices(uint32_t* dst_uncompressed_indices, __global bucket_t* b
}
}

__kernel void produce_solutions(__global uint32_t* dst_solutions, __global volatile uint32_t* n_solutions, __global bucket_t* buckets, __global digest_t* src_digests, __constant blake2b_state* digest, __global element_t* elements) {
__kernel void produce_solutions(__global uint32_t* dst_solutions, __global volatile uint32_t* n_solutions, __global bucket_t* buckets, __global digest_t* src_digests, __constant blake2b_state* digest) {
size_t start_bit = (EQUIHASH_K*NUM_COLLISION_BITS);
size_t byte_index = start_bit / 8;
size_t bit_index = start_bit % 8;
__global bucket_t* src_buckets = buckets + (EQUIHASH_K-1)*NUM_BUCKETS;
size_t start = get_global_id(0) * (NUM_BUCKETS / get_global_size(0)) + 1;
size_t start = get_global_id(0) * (NUM_BUCKETS / get_global_size(0));
size_t end = (get_global_id(0)+1) * (NUM_BUCKETS / get_global_size(0));

for(size_t i = start; i < end; ++i) {
Expand All @@ -728,11 +709,18 @@ __kernel void produce_solutions(__global uint32_t* dst_solutions, __global volat
uint32_t ai = mask_collision_bits(((__global uint8_t*)src_digests[base->digest_index]) + byte_index, bit_index);
uint32_t bi = mask_collision_bits(((__global uint8_t*)src_digests[el->digest_index]) + byte_index, bit_index);
if(ai == bi && ai != 0) {
if(b != bucket->size-1) {
//uint32_t ci = mask_collision_bits(((__global uint8_t*)src_digests[(bucket->data + b+1)->digest_index]) + byte_index, bit_index);
//if(ci == bi) {
// has_dupe = 1;
// break;
//}
}


uint32_t uncompressed_indices[NUM_INDICES];
decompress_indices(uncompressed_indices, buckets, base, elements + EQUIHASH_K*NUM_INDICES/2*get_global_id(0));
decompress_indices(uncompressed_indices + NUM_INDICES/2, buckets, el, elements + EQUIHASH_K*NUM_INDICES/2*get_global_id(0));
decompress_indices(uncompressed_indices, buckets, base);
decompress_indices(uncompressed_indices + NUM_INDICES/2, buckets, el);

for(size_t k = 0; k < NUM_INDICES && !has_dupe; ++k) {
//printf("%u ", uncompressed_indices[k]);
Expand All @@ -746,7 +734,7 @@ __kernel void produce_solutions(__global uint32_t* dst_solutions, __global volat
//printf("\n\n");
if(!has_dupe) {
//atomic_add(n_solutions, 1);
memcpy_step0t2(dst_solutions + atomic_add(n_solutions, 1)*NUM_INDICES, uncompressed_indices, NUM_INDICES*sizeof(uint32_t));
memcpy_step0(dst_solutions + atomic_add(n_solutions, 1)*NUM_INDICES, uncompressed_indices, NUM_INDICES*sizeof(uint32_t));
} else {
break;
}
Expand All @@ -755,4 +743,4 @@ __kernel void produce_solutions(__global uint32_t* dst_solutions, __global volat
}
bucket->size = 0;
}
}
}

0 comments on commit 2cc8991

Please sign in to comment.