Optimization of the softmax layer (Tencent#914)
* Optimize the loop structure to improve the speed of the softmax layer and reduce memory consumption

* use 4 spaces instead of tabs
gfjiangly authored and nihui committed Apr 21, 2019
1 parent 8724440 commit 9ffe2b8
Showing 1 changed file with 13 additions and 141 deletions.
154 changes: 13 additions & 141 deletions src/layer/softmax.cpp
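The first commit-message bullet folds the standalone exp() pass into the pass that accumulates the sum. A minimal sketch of that pattern on a plain float buffer (an assumption for illustration, not the ncnn Mat-based code itself): the pre-commit code ran a separate exp() loop between the max loop and the sum loop, so in the 1-D branch the number of passes over the data drops from four to three.

#include <algorithm>
#include <cfloat>
#include <cmath>

// Softmax over a 1-D buffer of length w, post-commit style: exp() is computed
// inside the same loop that accumulates the sum, so the buffer is walked
// three times (max, exp+sum, divide) instead of four.
static void softmax_1d(float* ptr, int w)
{
    float max = -FLT_MAX;
    for (int i = 0; i < w; i++)
    {
        max = std::max(max, ptr[i]);
    }

    float sum = 0.f;
    for (int i = 0; i < w; i++)
    {
        ptr[i] = std::exp(ptr[i] - max); // previously a separate loop ahead of the sum loop
        sum += ptr[i];
    }

    for (int i = 0; i < w; i++)
    {
        ptr[i] /= sum;
    }
}

The same fusion is applied to every branch of forward_inplace in the hunks below.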
@@ -77,14 +77,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[i]);
}

for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
}

float sum = 0.f;
for (int i=0; i<w; i++)
{
ptr[i] = exp(ptr[i] - max);
sum += ptr[i];
}

@@ -116,26 +112,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
}
}

Mat sum;
sum.create(w, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);

for (int i=0; i<h; i++)
for (int i = 0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);
float* ptr = bottom_top_blob.row(i);
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max[j]);
sum[j] += ptr[j];
}
}
@@ -157,58 +145,22 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;

Mat max;
max.create(h, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float* ptr = bottom_top_blob.row(i);
float m = -FLT_MAX;
for (int j=0; j<w; j++)
{
m = std::max(m, ptr[j]);
}

max[i] = m;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float m = max[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
}
}

Mat sum;
sum.create(h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;

for (int i=0; i<h; i++)
{
const float* ptr = bottom_top_blob.row(i);

float s = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - m);
s += ptr[j];
}

sum[i] = s;
}

for (int i=0; i<h; i++)
{
float* ptr = bottom_top_blob.row(i);

float s = sum[i];
for (int j=0; j<w; j++)
{
ptr[j] /= s;
@@ -240,28 +192,18 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
}
}

Mat sum;
sum.create(w, h, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<size; i++)
{
ptr[i] = exp(ptr[i] - max[i]);
sum[i] += ptr[i];
}
}
@@ -308,23 +250,6 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
}

ptr += w;
}
}

Mat sum;
sum.create(w, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
@@ -333,13 +258,15 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - maxptr[j]);
sumptr[j] += ptr[j];
}

@@ -373,16 +300,10 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;

Mat max;
max.create(h, channels, elemsize, opt.workspace_allocator);
if (max.empty())
return -100;
max.fill(-FLT_MAX);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);
float* ptr = bottom_top_blob.channel(q);

for (int i=0; i<h; i++)
{
@@ -392,62 +313,13 @@ int Softmax::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
max = std::max(max, ptr[j]);
}

maxptr[i] = max;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* maxptr = max.row(q);

for (int i=0; i<h; i++)
{
float max = maxptr[i];
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
}

ptr += w;
}
}

Mat sum;
sum.create(h, channels, elemsize, opt.workspace_allocator);
if (sum.empty())
return -100;
sum.fill(0.f);
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = 0.f;
for (int j=0; j<w; j++)
{
ptr[j] = exp(ptr[j] - max);
sum += ptr[j];
}

sumptr[i] = sum;
ptr += w;
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
float* sumptr = sum.row(q);

for (int i=0; i<h; i++)
{
float sum = sumptr[i];
for (int j=0; j<w; j++)
{
ptr[j] /= sum;
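In the branches that reduce along the innermost axis (dims == 2 with axis == 1, and dims == 3 with axis == 2), the rewrite also drops the per-row max and sum workspace Mats: each row's max and sum now live in plain locals, which is where the memory saving comes from. A minimal self-contained sketch of that per-row pattern, assuming a plain row-major h x w float buffer in place of ncnn's Mat and a bare OpenMP pragma standing in for ncnn's #pragma omp parallel for num_threads(opt.num_threads):

#include <algorithm>
#include <cfloat>
#include <cmath>

// Per-row softmax over a row-major h x w buffer, mirroring the post-commit
// inner-axis branches: max and sum are per-row locals, so no workspace Mat is
// allocated, and rows are independent so the outer loop can run in parallel.
static void softmax_per_row(float* data, int h, int w)
{
    #pragma omp parallel for
    for (int i = 0; i < h; i++)
    {
        float* ptr = data + i * w;

        float max = -FLT_MAX;
        for (int j = 0; j < w; j++)
        {
            max = std::max(max, ptr[j]);
        }

        float sum = 0.f;
        for (int j = 0; j < w; j++)
        {
            ptr[j] = std::exp(ptr[j] - max); // exp fused with the sum accumulation
            sum += ptr[j];
        }

        for (int j = 0; j < w; j++)
        {
            ptr[j] /= sum;
        }
    }
}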
