Skip to content

Commit

Permalink
Add stride3 min/max and tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
petecoup committed Jan 10, 2012
1 parent c89a8af commit 3871988
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 49 deletions.
13 changes: 8 additions & 5 deletions include/argon/primitives/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,16 @@ uint8_t ar_vmaxall_u8(const uint8_t* a, uint32_t n);
uint8_t ar_vminall_u8(const uint8_t* a, uint32_t n);
/*
 * Per-line maximum of a 2-way interleaved uint8_t buffer.
 *
 * a            - input buffer; a[0], a[2], ... form line 0 and
 *                a[1], a[3], ... form line 1.
 * n            - number of elements in a (both lines combined).
 * line_results - output array of at least 2 elements; [0] receives the
 *                maximum of line 0 and [1] the maximum of line 1.
 */
void ar_stride2_vmaxall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results);
/*
 * Per-line minimum of a 2-way interleaved uint8_t buffer.
 *
 * a            - input buffer; a[0], a[2], ... form line 0 and
 *                a[1], a[3], ... form line 1.
 * n            - number of elements in a (both lines combined).
 * line_results - output array of at least 2 elements; [0] receives the
 *                minimum of line 0 and [1] the minimum of line 1.
 */
void ar_stride2_vminall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results);

/*
 * Per-line maximum of a 3-way interleaved uint8_t buffer.
 * Element a[3k + j] belongs to line j (j = 0, 1, 2); n is the number of
 * elements in a (all three lines combined). The maximum of line j is
 * written to line_results[j], so line_results must hold at least 3 bytes.
 */
void ar_stride3_vmaxall_u8(const uint8_t* a,
uint32_t n,
uint8_t* line_results);
/*
 * Per-line minimum of a 3-way interleaved uint8_t buffer.
 * Same layout and parameters as ar_stride3_vmaxall_u8; the minimum of
 * line j is written to line_results[j].
 */
void ar_stride3_vminall_u8(const uint8_t* a,
uint32_t n,
uint8_t* line_results);
#endif

188 changes: 152 additions & 36 deletions src/primitives/ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,13 @@ uint8_t ar_vminall_u8(const uint8_t* a, uint32_t n)
#ifdef ENABLE_NEON_OPTS
void ar_stride2_vminall_u8_neon(const uint8_t* a,
uint32_t n,
uint8_t* line0_result,
uint8_t* line1_result)
uint8_t* line_results)
{
uint8x16x2_t a_loaded;
uint8x16_t line0_min = vdupq_n_u8(255);
uint8x16_t line1_min = vdupq_n_u8(255);
*line0_result = 255;
*line1_result = 255;
line_results[0] = 255;
line_results[1] = 255;

uint8_t line0_array[16];
uint8_t line1_array[16];
Expand All @@ -376,52 +375,47 @@ void ar_stride2_vminall_u8_neon(const uint8_t* a,
vst1q_u8(line1_array, line1_min);

for (uint32_t i = 0; i < 16; i++) {
*line0_result = ar_min_u8(*line0_result, line0_array[i]);
*line1_result = ar_min_u8(*line1_result, line1_array[i]);
line_results[0] = ar_min_u8(line_results[0], line0_array[i]);
line_results[1] = ar_min_u8(line_results[1], line1_array[i]);
}
}
#endif

/*
 * Portable per-line minimum of a 2-way interleaved uint8_t buffer.
 *
 * Even indices of a belong to line 0, odd indices to line 1.
 *
 * a            - input buffer of n elements.
 * n            - total element count (both lines combined).
 * line_results - output; [0] receives the minimum of line 0, [1] the
 *                minimum of line 1. A line with no elements yields 255.
 */
void ar_stride2_vminall_u8_generic(const uint8_t* a,
                                   uint32_t n,
                                   uint8_t* line_results)
{
    /* Largest even count <= n: only complete pairs are read in the loop,
     * so a[i + 1] never indexes past the end of the buffer. */
    const uint32_t paired = n - (n % 2u);

    line_results[0] = 255;
    line_results[1] = 255;

    for (uint32_t i = 0; i < paired; i += 2) {
        line_results[0] = ar_min_u8(line_results[0], a[i]);
        line_results[1] = ar_min_u8(line_results[1], a[i + 1]);
    }

    /* Odd n: the last element belongs to line 0. */
    if (paired < n) {
        line_results[0] = ar_min_u8(line_results[0], a[paired]);
    }
}

/*
 * Per-line minimum of a 2-way interleaved uint8_t buffer.
 *
 * Forwards to the NEON implementation when ENABLE_NEON_OPTS is defined at
 * compile time, otherwise to the portable implementation.
 *
 * a            - input buffer of n elements.
 * n            - total element count (both lines combined).
 * line_results - output array of at least 2 elements.
 */
void ar_stride2_vminall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results)
{
    /* Plain calls: ISO C does not allow `return expr;` in a void function. */
#ifdef ENABLE_NEON_OPTS
    ar_stride2_vminall_u8_neon(a, n, line_results);
#else
    ar_stride2_vminall_u8_generic(a, n, line_results);
#endif
}

//-----------------------------------------------------------------------------
#ifdef ENABLE_NEON_OPTS
void ar_stride2_vmaxall_u8_neon(const uint8_t* a,
uint32_t n,
uint8_t* line0_result,
uint8_t* line1_result)
uint8_t* line_results)
{
uint8x16x2_t a_loaded;
uint8x16_t line0_max = vdupq_n_u8(0);
uint8x16_t line1_max = vdupq_n_u8(0);
*line0_result = 0;
*line1_result = 0;
line_results[0] = 0;
line_results[1] = 0;

uint8_t line0_array[16];
uint8_t line1_array[16];
Expand All @@ -436,37 +430,159 @@ void ar_stride2_vmaxall_u8_neon(const uint8_t* a,
vst1q_u8(line1_array, line1_max);

for (uint32_t i = 0; i < 16; i++) {
*line0_result = ar_max_u8(*line0_result, line0_array[i]);
*line1_result = ar_max_u8(*line1_result, line1_array[i]);
line_results[0] = ar_max_u8(line_results[0], line0_array[i]);
line_results[1] = ar_max_u8(line_results[1], line1_array[i]);
}
}
#endif

/*
 * Portable per-line maximum of a 2-way interleaved uint8_t buffer.
 *
 * Even indices of a belong to line 0, odd indices to line 1.
 *
 * a            - input buffer of n elements.
 * n            - total element count (both lines combined).
 * line_results - output; [0] receives the maximum of line 0, [1] the
 *                maximum of line 1. A line with no elements yields 0.
 */
void ar_stride2_vmaxall_u8_generic(const uint8_t* a,
                                   uint32_t n,
                                   uint8_t* line_results)
{
    /* Largest even count <= n: only complete pairs are read in the loop,
     * so a[i + 1] never indexes past the end of the buffer. */
    const uint32_t paired = n - (n % 2u);

    line_results[0] = 0;
    line_results[1] = 0;

    for (uint32_t i = 0; i < paired; i += 2) {
        line_results[0] = ar_max_u8(line_results[0], a[i]);
        line_results[1] = ar_max_u8(line_results[1], a[i + 1]);
    }

    /* Odd n: the last element belongs to line 0. */
    if (paired < n) {
        line_results[0] = ar_max_u8(line_results[0], a[paired]);
    }
}

/*
 * Per-line maximum of a 2-way interleaved uint8_t buffer.
 *
 * Forwards to the NEON implementation when ENABLE_NEON_OPTS is defined at
 * compile time, otherwise to the portable implementation.
 *
 * a            - input buffer of n elements.
 * n            - total element count (both lines combined).
 * line_results - output array of at least 2 elements.
 */
void ar_stride2_vmaxall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results)
{
    /* Plain calls: ISO C does not allow `return expr;` in a void function. */
#ifdef ENABLE_NEON_OPTS
    ar_stride2_vmaxall_u8_neon(a, n, line_results);
#else
    ar_stride2_vmaxall_u8_generic(a, n, line_results);
#endif
}

//-----------------------------------------------------------------------------
#ifdef ENABLE_NEON_OPTS
void ar_stride3_vmaxall_u8_neon(const uint8_t* a,
uint32_t n,
uint8_t* line_results)
{
uint8x16x3_t a_loaded;
uint8x16_t line0_max = vdupq_n_u8(0);
uint8x16_t line1_max = vdupq_n_u8(0);
uint8x16_t line2_max = vdupq_n_u8(0);

line_results[0] = 0;
line_results[0] = 0;
line_results[0] = 0;

uint8_t line0_array[16];
uint8_t line1_array[16];
uint8_t line2_array[16];

for (uint32_t i = 0; i < n; i += 48) {
a_loaded = vld3q_u8(&(a[i]));
line0_max = vmaxq_u8(a_loaded.val[0], line0_max);
line1_max = vmaxq_u8(a_loaded.val[1], line1_max);
line2_max = vmaxq_u8(a_loaded.val[2], line2_max);
}

vst1q_u8(line0_array, line0_max);
vst1q_u8(line1_array, line1_max);

for (uint32_t i = 0; i < 16; i++) {
line_results[0] = ar_max_u8(line_results[0], line0_array[i]);
line_results[1] = ar_max_u8(line_results[1], line1_array[i]);
line_results[2] = ar_max_u8(line_results[2], line2_array[i]);
}
}
#endif

/*
 * Portable per-line maximum of a 3-way interleaved uint8_t buffer.
 *
 * Element a[3k + j] belongs to line j (j = 0, 1, 2).
 *
 * a            - input buffer of n elements.
 * n            - total element count (all three lines combined).
 * line_results - output array of at least 3 elements; [j] receives the
 *                maximum of line j. A line with no elements yields 0.
 */
void ar_stride3_vmaxall_u8_generic(const uint8_t* a,
                                   uint32_t n,
                                   uint8_t* line_results)
{
    /* Largest multiple of 3 that is <= n: the main loop only touches
     * complete triples, so a[i + 1] and a[i + 2] stay inside the buffer. */
    const uint32_t full = n - (n % 3u);

    line_results[0] = 0;
    line_results[1] = 0;
    line_results[2] = 0;

    for (uint32_t i = 0; i < full; i += 3) {
        line_results[0] = ar_max_u8(line_results[0], a[i]);
        line_results[1] = ar_max_u8(line_results[1], a[i + 1]);
        line_results[2] = ar_max_u8(line_results[2], a[i + 2]);
    }

    /* Up to two trailing elements; they belong to lines 0 and 1. */
    for (uint32_t i = full; i < n; i++) {
        const uint32_t line = i - full;
        line_results[line] = ar_max_u8(line_results[line], a[i]);
    }
}

/*
 * Per-line maximum of a 3-way interleaved uint8_t buffer.
 *
 * Forwards to the NEON implementation when ENABLE_NEON_OPTS is defined at
 * compile time, otherwise to the portable implementation.
 *
 * a            - input buffer of n elements.
 * n            - total element count (all three lines combined).
 * line_results - output array of at least 3 elements.
 */
void ar_stride3_vmaxall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results)
{
    /* Plain calls: ISO C does not allow `return expr;` in a void function. */
#ifdef ENABLE_NEON_OPTS
    ar_stride3_vmaxall_u8_neon(a, n, line_results);
#else
    ar_stride3_vmaxall_u8_generic(a, n, line_results);
#endif
}

//-----------------------------------------------------------------------------
#ifdef ENABLE_NEON_OPTS
void ar_stride3_vminall_u8_neon(const uint8_t* a,
uint32_t n,
uint8_t* line_results)
{
uint8x16x3_t a_loaded;
uint8x16_t line0_min = vdupq_n_u8(255);
uint8x16_t line1_min = vdupq_n_u8(255);
uint8x16_t line2_min = vdupq_n_u8(255);

line_results[0] = 255;
line_results[0] = 255;
line_results[0] = 255;

uint8_t line0_array[16];
uint8_t line1_array[16];
uint8_t line2_array[16];

for (uint32_t i = 0; i < n; i += 48) {
a_loaded = vld3q_u8(&(a[i]));
line0_min = vminq_u8(a_loaded.val[0], line0_min);
line1_min = vminq_u8(a_loaded.val[1], line1_min);
line2_min = vminq_u8(a_loaded.val[2], line2_min);
}

vst1q_u8(line0_array, line0_min);
vst1q_u8(line1_array, line1_min);

for (uint32_t i = 0; i < 16; i++) {
line_results[0] = ar_min_u8(line_results[0], line0_array[i]);
line_results[1] = ar_min_u8(line_results[1], line1_array[i]);
line_results[2] = ar_min_u8(line_results[2], line2_array[i]);
}
}
#endif

/*
 * Portable per-line minimum of a 3-way interleaved uint8_t buffer.
 *
 * Element a[3k + j] belongs to line j (j = 0, 1, 2).
 *
 * a            - input buffer of n elements.
 * n            - total element count (all three lines combined).
 * line_results - output array of at least 3 elements; [j] receives the
 *                minimum of line j. A line with no elements yields 255.
 */
void ar_stride3_vminall_u8_generic(const uint8_t* a,
                                   uint32_t n,
                                   uint8_t* line_results)
{
    /* Largest multiple of 3 that is <= n: the main loop only touches
     * complete triples, so a[i + 1] and a[i + 2] stay inside the buffer. */
    const uint32_t full = n - (n % 3u);

    line_results[0] = 255;
    line_results[1] = 255;
    line_results[2] = 255;

    for (uint32_t i = 0; i < full; i += 3) {
        line_results[0] = ar_min_u8(line_results[0], a[i]);
        line_results[1] = ar_min_u8(line_results[1], a[i + 1]);
        line_results[2] = ar_min_u8(line_results[2], a[i + 2]);
    }

    /* Up to two trailing elements; they belong to lines 0 and 1. */
    for (uint32_t i = full; i < n; i++) {
        const uint32_t line = i - full;
        line_results[line] = ar_min_u8(line_results[line], a[i]);
    }
}

/*
 * Per-line minimum of a 3-way interleaved uint8_t buffer.
 *
 * Forwards to the NEON implementation when ENABLE_NEON_OPTS is defined at
 * compile time, otherwise to the portable implementation.
 *
 * a            - input buffer of n elements.
 * n            - total element count (all three lines combined).
 * line_results - output array of at least 3 elements.
 */
void ar_stride3_vminall_u8(const uint8_t* a,
                           uint32_t n,
                           uint8_t* line_results)
{
    /* Plain calls: ISO C does not allow `return expr;` in a void function. */
#ifdef ENABLE_NEON_OPTS
    ar_stride3_vminall_u8_neon(a, n, line_results);
#else
    ar_stride3_vminall_u8_generic(a, n, line_results);
#endif
}

Loading

0 comments on commit 3871988

Please sign in to comment.