Skip to content

Commit

Permalink
Small CryptoNight optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasjones committed May 20, 2014
1 parent 73187ba commit 927ecbe
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 69 deletions.
27 changes: 11 additions & 16 deletions crypto/aesb.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,27 +146,22 @@ d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);

/*
 * aesb_single_round: calls `round` once, passing `out`, `in` and `expandedKey`
 * reinterpreted as uint32_t words. `round` itself is defined outside this
 * excerpt; from its use with an uninitialised scratch buffer in
 * aesb_pseudo_round_mut its first argument appears to be the destination.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped. Matching the
 * "11 additions & 16 deletions" summary for crypto/aesb.c, the four lines that
 * use the locals kp/i/o are the removed version and the single cast-inline call
 * is its replacement; only one of the two belongs in the real function.
 * NOTE(review): both versions cast away the const on `in`, and both assume the
 * byte pointers are suitably aligned for uint32_t access -- confirm at callers.
 */
void aesb_single_round(const uint8_t *in, uint8_t *out, uint8_t *expandedKey)
{
/* Pre-change form: name the reinterpreted pointers, then make one call. */
const uint32_t *kp = (uint32_t *) expandedKey;
uint32_t *i = (uint32_t*) in;
uint32_t *o = (uint32_t*) out;
round(o, i, kp);
/* Post-change form: the same call with the casts written inline. */
round(((uint32_t*) out), ((uint32_t*) in), ((uint32_t*) expandedKey));
}

/*
 * aesb_pseudo_round_mut: transforms the block at `val` in place through a chain
 * of `round` calls that ping-pong between `val` and the local scratch buffer
 * b1, so the last call of a chain leaves its result in `val`. Each call uses
 * the key words starting at offset k * N_COLS in expandedKey, for k = 0..9.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped. Matching the
 * "11 additions & 16 deletions" summary for crypto/aesb.c, the first chain
 * (using the locals v and kp) is the removed version and the second chain
 * (casts written inline) is its replacement. Read literally the body would run
 * twenty calls; each version on its own runs ten.
 * NOTE(review): assumes `val` and `expandedKey` are aligned for uint32_t
 * access -- confirm at callers.
 */
void aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey)
{
/* Scratch block; written by the first call of a chain before it is read. */
uint32_t b1[4];
/* Pre-change chain: ten calls through the named pointers v and kp. */
uint32_t *v = (uint32_t*) val;
const uint32_t *kp = (uint32_t *) expandedKey;
round(b1, v, kp);
round(v, b1, kp + 1 * N_COLS);
round(b1, v, kp + 2 * N_COLS);
round(v, b1, kp + 3 * N_COLS);
round(b1, v, kp + 4 * N_COLS);
round(v, b1, kp + 5 * N_COLS);
round(b1, v, kp + 6 * N_COLS);
round(v, b1, kp + 7 * N_COLS);
round(b1, v, kp + 8 * N_COLS);
round(v, b1, kp + 9 * N_COLS);
/* Post-change chain: the same ten calls with the casts repeated inline. */
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey));
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 1 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 2 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 3 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 4 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 5 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 6 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 7 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 8 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 9 * N_COLS);
}


Expand Down
39 changes: 25 additions & 14 deletions crypto/c_keccak.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,33 +36,44 @@ void keccakf(uint64_t st[25], int rounds)
int i, j, round;
uint64_t t, bc[5];

for (round = 0; round < rounds; round++) {
for (round = 0; round < rounds; ++round) {

// Theta
for (i = 0; i < 5; i++)
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];

for (i = 0; i < 5; i++) {
for (i = 0; i < 5; ++i) {
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
for (j = 0; j < 25; j += 5)
st[j + i] ^= t;
st[i ] ^= t;
st[i + 5] ^= t;
st[i + 10] ^= t;
st[i + 15] ^= t;
st[i + 20] ^= t;
}

// Rho Pi
t = st[1];
for (i = 0; i < 24; i++) {
j = keccakf_piln[i];
bc[0] = st[j];
st[j] = ROTL64(t, keccakf_rotc[i]);
for (i = 0; i < 24; ++i) {
bc[0] = st[keccakf_piln[i]];
st[keccakf_piln[i]] = ROTL64(t, keccakf_rotc[i]);
t = bc[0];
}

// Chi
for (j = 0; j < 25; j += 5) {
for (i = 0; i < 5; i++)
bc[i] = st[j + i];
for (i = 0; i < 5; i++)
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
bc[0] = st[j ];
bc[1] = st[j + 1];
bc[2] = st[j + 2];
bc[3] = st[j + 3];
bc[4] = st[j + 4];
st[j ] ^= (~bc[1]) & bc[2];
st[j + 1] ^= (~bc[2]) & bc[3];
st[j + 2] ^= (~bc[3]) & bc[4];
st[j + 3] ^= (~bc[4]) & bc[0];
st[j + 4] ^= (~bc[0]) & bc[1];
}

// Iota
Expand Down
6 changes: 3 additions & 3 deletions crypto/c_skein.c
Original file line number Diff line number Diff line change
Expand Up @@ -1355,15 +1355,15 @@ static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
/* run Threefish in "counter mode" to generate output */
memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */
/*
 * Output loop: one Skein_256_Process_Block call per output block, copying up
 * to SKEIN_256_BLOCK_BYTES bytes of ctx->X into hashVal each time and then
 * restoring ctx->X from the saved copy X.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped ("3 additions
 * & 3 deletions" for crypto/c_skein.c). Each pair of near-identical lines below
 * is old-then-new: the old form counts blocks in i, the new form steps i in
 * bytes.
 */
/* Pre-change header: i is a block index. */
for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)
/* Post-change header: i is a byte offset into hashVal. */
for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES)
{
/* NOTE(review): this unchanged line still writes i into the counter block.
 * With the byte-stepping header the counter becomes the byte offset instead
 * of the block index 0,1,2,... used before. The two agree only for the first
 * block, so any output longer than one block would change -- confirm that
 * byteCnt never exceeds SKEIN_256_BLOCK_BYTES here, or divide i back down. */
((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
Skein_Start_New_Type(ctx,OUT_FINAL);
Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
/* Pre-change: remaining bytes computed from the block index. */
n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */
/* Post-change: remaining bytes computed from the byte offset. */
n = byteCnt - i; /* number of output bytes left to go */
if (n >= SKEIN_256_BLOCK_BYTES)
n = SKEIN_256_BLOCK_BYTES;
/* Pre-change destination, then post-change destination. */
Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */
Skein_Put64_LSB_First(hashVal+i,ctx->X,n); /* "output" the ctr mode bytes */
/* NOTE(review): unchanged line that still scales i by the block size; with
 * the byte-stepping header this pointer no longer matches the one written
 * just above. Skein_Show_Final is not defined in this excerpt -- check
 * whether it expands to real code in this build. */
Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
}
Expand Down
73 changes: 37 additions & 36 deletions cryptonight.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}

static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];

Expand All @@ -95,16 +95,6 @@ static inline void copy_block(uint8_t* dst, const uint8_t* src) {
((uint64_t*) dst)[1] = ((uint64_t*) src)[1];
}

/*
 * Exchange the contents of two AES_BLOCK_SIZE-byte buffers.
 *
 * The bytes are swapped one at a time, front to back. Both `a` and `b` must
 * point to at least AES_BLOCK_SIZE writable bytes.
 */
static void swap_blocks(uint8_t* a, uint8_t* b) {
    size_t remaining = AES_BLOCK_SIZE;

    while (remaining-- > 0) {
        const uint8_t saved = *a;
        *a++ = *b;
        *b++ = saved;
    }
}

static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
Expand All @@ -116,12 +106,12 @@ static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* d
}

/*
 * Working state for one CryptoNight hash computation: the large scratchpad
 * (long_state), the hash state union, the text buffer that is run through the
 * AES pseudo-rounds, three AES_BLOCK_SIZE working blocks (a, b, c) and a
 * pointer to the AES key context.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped ("37
 * additions & 36 deletions" for cryptonight.c). long_state, a, b and c each
 * appear twice below: first the removed declaration, then its replacement.
 * Read literally these are duplicate members; only one of each pair belongs in
 * the real struct.
 */
struct cryptonight_ctx {
/* Pre-change: no explicit alignment on the scratchpad. */
uint8_t long_state[MEMORY];
/* Post-change: scratchpad aligned to 8 bytes.
 * NOTE(review): spelled `__attribute` here but `__attribute__` on the members
 * below; prefer the `__attribute__` spelling throughout for consistency. */
uint8_t long_state[MEMORY] __attribute((aligned(8)));
union cn_slow_hash_state state;
uint8_t text[INIT_SIZE_BYTE];
/* Pre-change: working blocks aligned to 64 bytes. */
uint8_t a[AES_BLOCK_SIZE] __attribute__((aligned(64)));
uint8_t b[AES_BLOCK_SIZE] __attribute__((aligned(64)));
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
/* Post-change: working blocks aligned to 8 bytes. The block helpers in this
 * file (copy_block, xor_blocks) access these buffers through uint64_t
 * pointers, so 8 is the smallest alignment that keeps those accesses aligned. */
uint8_t a[AES_BLOCK_SIZE] __attribute__((aligned(8)));
uint8_t b[AES_BLOCK_SIZE] __attribute__((aligned(8)));
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(8)));
oaes_ctx* aes_ctx;
};

Expand All @@ -132,43 +122,54 @@ void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cr
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY / INIT_SIZE_BYTE); ++i) {
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
memcpy(&ctx->long_state[i * INIT_SIZE_BYTE], ctx->text, INIT_SIZE_BYTE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
RND(3);
RND(4);
RND(5);
RND(6);
RND(7);
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
}

xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);

/*
 * Main scratchpad loop of cryptonight_hash_ctx.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped. The first
 * header (ITER / 2) and the first "Iteration 1" triple plus the copy_block
 * call are the removed version; the second header (ITER / 4), the second
 * "Iteration 1" triple and "Iteration 3"/"Iteration 4" are the replacement.
 * The replacement does four steps per pass and swaps the roles of ctx->b and
 * ctx->c in steps 3-4 instead of copying c into b.
 * NOTE(review): ITER / 4 passes of four steps match ITER / 2 passes of two
 * steps only if ITER is a multiple of 4; ITER is not defined in this excerpt
 * -- TODO confirm.
 */
/* Pre-change header: two steps per pass. */
for (i = 0; likely(i < ITER / 2); ++i) {
/* Post-change header: four steps per pass. */
for (i = 0; likely(i < ITER / 4); ++i) {
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
/* Pre-change: j is a block index, scaled at each use. */
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j * AES_BLOCK_SIZE], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j * AES_BLOCK_SIZE]);
/* Post-change: j is scaled to a byte offset once. */
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
/* Iteration 2 */
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
/* Pre-change only: carry c over into b for the next pass. */
copy_block(ctx->b, ctx->c);
/* Iteration 3 */
/* Post-change only: same as iteration 1 with b and c exchanged. */
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
/* Iteration 4 */
/* Post-change only: same as iteration 2 with b in place of c. */
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
}

memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
/*
 * Final pass of cryptonight_hash_ctx: XOR each scratchpad block into the
 * matching block of ctx->text, then run the AES pseudo-round on that block.
 *
 * NOTE(review): this text is a diff with the +/- markers stripped. The nested
 * i/j loops are the removed version; the byte-stepping loop with the RND
 * macro is the replacement.
 */
/* Pre-change: outer loop over INIT_SIZE_BYTE chunks, inner loop over blocks. */
for (i = 0; likely(i < MEMORY / INIT_SIZE_BYTE); ++i) {
for (j = 0; likely(j < INIT_SIZE_BLK); ++j) {
xor_blocks(&ctx->text[j * AES_BLOCK_SIZE],
&ctx->long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[j * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
}
/* Post-change: i is a byte offset and the inner loop is unrolled by hand.
 * NOTE(review): eight RND calls replace a loop bounded by INIT_SIZE_BLK, so
 * this assumes INIT_SIZE_BLK == 8 -- TODO confirm. */
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
/* NOTE(review): RND is already defined, with a different body, in the earlier
 * scratchpad-fill loop of this function, and no #undef is visible between the
 * two definitions. Redefining a macro with a different replacement list is a
 * constraint violation in ISO C; add `#undef RND` before this line (and after
 * the last use). The macro also expands to two statements, ends in its own
 * `;`, and uses `p` unparenthesised -- safe for the literal arguments below
 * but not as the body of an unbraced if/else. */
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
RND(3);
RND(4);
RND(5);
RND(6);
RND(7);
}
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
hash_permutation(&ctx->state.hs);
Expand Down

0 comments on commit 927ecbe

Please sign in to comment.