Optimization of the softmax layer (Tencent#914)
* Optimize the loop structure to improve the speed of the softmax layer and reduce memory consumption

* use 4 spaces instead of tabs
gfjiangly authored and nihui committed Apr 21, 2019
1 parent 8724440 commit 9ffe2b8
Showing 1 changed file with 13 additions and 141 deletions.
154 changes: 13 additions & 141 deletions src/layer/softmax.cpp
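The first commit-message bullet folds the standalone exp() pass into the pass that accumulates the sum. A minimal sketch of that pattern on a plain float buffer (an assumption for illustration, not the ncnn Mat-based code itself): the pre-commit code ran a separate exp() loop between the max loop and the sum loop, so in the 1-D branch the number of passes over the data drops from four to three.

#include <algorithm>
#include <cfloat>
#include <cmath>

// Softmax over a 1-D buffer of length w, post-commit style: exp() is computed
// inside the same loop that accumulates the sum, so the buffer is walked
// three times (max, exp+sum, divide) instead of four.
static void softmax_1d(float* ptr, int w)
{
    float max = -FLT_MAX;
    for (int i = 0; i < w; i++)
    {
        max = std::max(max, ptr[i]);
    }

    float sum = 0.f;
    for (int i = 0; i < w; i++)
    {
        ptr[i] = std::exp(ptr[i] - max); // previously a separate loop ahead of the sum loop
        sum += ptr[i];
    }

    for (int i = 0; i < w; i++)
    {
        ptr[i] /= sum;
    }
}

The same fusion is applied to every branch of forward_inplace in the hunks below.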
@@ -77,14 +77,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[i]);
}

for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
}

float sum = 0.f;
for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
sum += ptr[i];
}

@@ -116,26 +112,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
}
}

Mat sum;
sum.create(w, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);

for (int i=0; i<h; i++)
for (int i = 0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
sum[j] += ptr[j];
}
}
@@ -157,58 +145,22 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;

Mat max;
max.create(h, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float* ptr = bottom_top_blob.row(i);
float m = -FLT_MAX;
for (int j=0; j<w; j++)
{
m = std::max(m, ptr[j]);
}

max[i] = m;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float m = max[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
}
}

Mat sum;
sum.create(h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float s = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
s += ptr[j];
}

sum[i] = s;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float s = sum[i];
for (int j=0; j<w; j++)
{
ptr[j] /= s;
@@ -240,28 +192,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
}
}

Mat sum;
sum.create(w, h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
sum[i] += ptr[i];
}
}
@@ -308,23 +250,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
}

ptr += w;
}
}

Mat sum;
sum.create(w, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
@@ -333,13 +258,15 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
sumptr[j] += ptr[j];
}

@@ -373,16 +300,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;

Mat max;
max.create(h, channels, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<h; i++)
{
@@ -392,62 +313,13 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[j]);
}

maxptr[i] = max;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
float max = maxptr[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
}

ptr += w;
}
}

Mat sum;
sum.create(h, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
sum += ptr[j];
}

sumptr[i] = sum;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = sumptr[i];
for (int j=0; j<w; j++)
{
ptr[j] /= sum;
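In the branches that reduce along the innermost axis (dims == 2 with axis == 1, and dims == 3 with axis == 2), the rewrite also drops the per-row max and sum workspace Mats: each row's max and sum now live in plain locals, which is where the memory saving comes from. A minimal self-contained sketch of that per-row pattern, assuming a plain row-major h x w float buffer in place of ncnn's Mat and a bare OpenMP pragma standing in for ncnn's #pragma omp parallel for num_threads(opt.num_threads):

#include <algorithm>
#include <cfloat>
#include <cmath>

// Per-row softmax over a row-major h x w buffer, mirroring the post-commit
// inner-axis branches: max and sum are per-row locals, so no workspace Mat is
// allocated, and rows are independent so the outer loop can run in parallel.
static void softmax_per_row(float* data, int h, int w)
{
    #pragma omp parallel for
    for (int i = 0; i < h; i++)
    {
        float* ptr = data + i * w;

        float max = -FLT_MAX;
        for (int j = 0; j < w; j++)
        {
            max = std::max(max, ptr[j]);
        }

        float sum = 0.f;
        for (int j = 0; j < w; j++)
        {
            ptr[j] = std::exp(ptr[j] - max); // exp fused with the sum accumulation
            sum += ptr[j];
        }

        for (int j = 0; j < w; j++)
        {
            ptr[j] /= sum;
        }
    }
}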
