From 69b9d45317d7d75949fabcf907a9740f349ffc2c Mon Sep 17 00:00:00 2001
From: "Yuan Yao (yuanyao)" <yuanyao@nvidia.com>
Date: Wed, 17 Jul 2024 11:33:40 -0700
Subject: [PATCH] update demoBERT perf data

Signed-off-by: Yuan Yao (yuanyao) <yuanyao@nvidia.com>
---
 demo/BERT/README.md | 352 ++++++++++++++++++++++----------------------
 1 file changed, 176 insertions(+), 176 deletions(-)

diff --git a/demo/BERT/README.md b/demo/BERT/README.md
index e2c53c19..68cc327c 100755
--- a/demo/BERT/README.md
+++ b/demo/BERT/README.md
@@ -435,78 +435,78 @@ Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` o
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 0.54 | 0.69 | 0.54 | 0.79 | 0.79 | 0.63 |
-| 128 | 2 | 0.76 | 0.76 | 0.61 | 0.72 | 0.92 | 0.72 |
-| 128 | 4 | 0.93 | 0.93 | 0.74 | 0.93 | 1.19 | 0.93 |
-| 128 | 8 | 0.94 | 1.20 | 0.94 | 1.31 | 1.31 | 1.31 |
-| 128 | 12 | 1.20 | 1.53 | 1.21 | 1.70 | 2.15 | 1.69 |
-| 128 | 16 | 1.33 | 1.34 | 1.33 | 2.08 | 2.08 | 2.06 |
-| 128 | 24 | 1.82 | 1.82 | 1.82 | 3.05 | 3.05 | 3.03 |
-| 128 | 32 | 2.23 | 2.24 | 2.23 | 3.92 | 3.92 | 3.90 |
-| 128 | 64 | 4.19 | 4.19 | 4.14 | 7.75 | 7.76 | 7.68 |
-| 128 | 128 | 8.14 | 8.14 | 8.08 | 15.37 | 15.44 | 15.29 |
-| 384 | 1 | 1.13 | 1.13 | 1.14 | 1.25 | 1.61 | 1.26 |
-| 384 | 2 | 1.32 | 1.56 | 1.32 | 1.55 | 1.55 | 1.54 |
-| 384 | 4 | 1.66 | 2.12 | 1.66 | 2.12 | 2.12 | 2.12 |
-| 384 | 8 | 2.21 | 2.30 | 2.21 | 3.34 | 3.40 | 3.33 |
-| 384 | 12 | 3.31 | 3.32 | 3.31 | 4.84 | 4.84 | 4.79 |
-| 384 | 16 | 4.00 | 4.00 | 4.00 | 6.39 | 6.39 | 6.36 |
-| 384 | 24 | 5.70 | 5.70 | 5.69 | 9.49 | 9.49 | 9.41 |
-| 384 | 32 | 7.70 | 7.72 | 7.64 | 13.02 | 13.03 | 12.89 |
-| 384 | 64 | 14.89 | 14.90 | 14.79 | 25.16 | 25.18 | 24.85 |
-| 384 | 128 | 29.01 | 29.02 | 28.78 | 49.11 | 49.24 | 48.73 |
+| 128 | 1 | 0.53 | 0.68 | 0.54 | 0.79 | 0.79 | 0.64 |
+| 128 | 2 | 0.76 | 0.76 | 0.60 | 0.72 | 0.91 | 0.72 |
+| 128 | 4 | 0.73 | 0.92 | 0.73 | 1.03 | 1.04 | 0.93 |
+| 128 | 8 | 0.94 | 1.20 | 0.95 | 1.31 | 1.31 | 1.31 |
+| 128 | 12 | 1.19 | 1.20 | 1.19 | 1.72 | 1.73 | 1.72 |
+| 128 | 16 | 1.33 | 1.71 | 1.34 | 2.07 | 2.08 | 2.05 |
+| 128 | 24 | 1.82 | 1.82 | 1.81 | 3.04 | 3.07 | 3.01 |
+| 128 | 32 | 2.23 | 2.24 | 2.23 | 3.90 | 3.93 | 3.86 |
+| 128 | 64 | 4.15 | 4.17 | 4.12 | 7.62 | 7.70 | 7.57 |
+| 128 | 128 | 8.11 | 8.12 | 8.03 | 15.34 | 15.35 | 15.20 |
+| 384 | 1 | 1.13 | 1.45 | 1.13 | 1.24 | 1.25 | 1.24 |
+| 384 | 2 | 1.31 | 1.31 | 1.31 | 1.54 | 1.98 | 1.55 |
+| 384 | 4 | 1.66 | 1.66 | 1.66 | 2.12 | 2.12 | 2.12 |
+| 384 | 8 | 2.21 | 2.21 | 2.20 | 3.34 | 3.36 | 3.32 |
+| 384 | 12 | 3.32 | 3.32 | 3.31 | 4.78 | 4.82 | 4.77 |
+| 384 | 16 | 4.01 | 4.01 | 4.00 | 6.37 | 6.44 | 6.35 |
+| 384 | 24 | 5.71 | 5.71 | 5.70 | 9.47 | 9.49 | 9.39 |
+| 384 | 32 | 7.64 | 7.64 | 7.63 | 13.00 | 13.04 | 12.85 |
+| 384 | 64 | 14.87 | 14.88 | 14.73 | 25.12 | 25.14 | 24.78 |
+| 384 | 128 | 28.96 | 28.97 | 28.70 | 48.93 | 49.13 | 48.57 |
 
 ##### BERT Large
 
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.24 | 1.25 | 1.24 | 1.55 | 1.55 | 1.55 |
-| 128 | 2 | 1.43 | 1.80 | 1.43 | 1.82 | 1.82 | 1.82 |
-| 128 | 4 | 1.78 | 1.79 | 1.78 | 2.53 | 2.54 | 2.53 |
-| 128 | 8 | 2.64 | 2.64 | 2.64 | 3.99 | 4.01 | 3.96 |
-| 128 | 12 | 3.08 | 3.09 | 3.08 | 5.08 | 5.08 | 5.02 |
-| 128 | 16 | 4.03 | 4.03 | 4.03 | 6.94 | 6.94 | 6.89 |
-| 128 | 24 | 5.32 | 5.34 | 5.28 | 9.71 | 9.80 | 9.69 |
-| 128 | 32 | 7.02 | 7.09 | 6.99 | 12.95 | 13.08 | 12.89 |
-| 128 | 64 | 12.89 | 12.89 | 12.80 | 24.83 | 25.00 | 24.65 |
-| 128 | 128 | 25.28 | 25.29 | 25.05 | 49.15 | 49.41 | 48.82 |
-| 384 | 1 | 2.55 | 2.56 | 2.55 | 2.96 | 2.96 | 2.96 |
-| 384 | 2 | 3.04 | 3.04 | 3.03 | 4.00 | 4.01 | 4.00 |
-| 384 | 4 | 4.04 | 4.04 | 4.04 | 5.73 | 5.75 | 5.70 |
-| 384 | 8 | 7.17 | 7.17 | 7.16 | 11.14 | 11.16 | 11.07 |
-| 384 | 12 | 9.14 | 9.14 | 9.13 | 15.46 | 15.47 | 15.36 |
-| 384 | 16 | 12.28 | 12.40 | 12.28 | 21.20 | 21.31 | 21.06 |
-| 384 | 24 | 17.70 | 17.84 | 17.63 | 31.03 | 31.04 | 30.76 |
-| 384 | 32 | 23.29 | 23.30 | 23.11 | 41.07 | 41.31 | 40.74 |
-| 384 | 64 | 44.94 | 45.20 | 44.87 | 80.15 | 80.36 | 79.42 |
-| 384 | 128 | 87.97 | 87.99 | 87.81 | 157.22 | 157.81 | 156.05 |
+| 128 | 1 | 1.22 | 1.23 | 1.22 | 1.54 | 1.91 | 1.55 |
+| 128 | 2 | 1.42 | 1.42 | 1.41 | 1.82 | 1.82 | 1.82 |
+| 128 | 4 | 1.78 | 2.06 | 1.79 | 2.50 | 2.50 | 2.50 |
+| 128 | 8 | 2.64 | 2.64 | 2.64 | 3.98 | 3.98 | 3.98 |
+| 128 | 12 | 3.09 | 3.09 | 3.08 | 5.02 | 5.07 | 4.99 |
+| 128 | 16 | 4.09 | 4.09 | 4.08 | 6.93 | 6.94 | 6.86 |
+| 128 | 24 | 5.28 | 5.28 | 5.27 | 9.64 | 9.68 | 9.56 |
+| 128 | 32 | 7.01 | 7.01 | 6.95 | 12.92 | 13.07 | 12.85 |
+| 128 | 64 | 12.86 | 12.86 | 12.73 | 24.79 | 25.07 | 24.59 |
+| 128 | 128 | 25.03 | 25.26 | 24.99 | 49.12 | 49.28 | 48.83 |
+| 384 | 1 | 2.55 | 2.55 | 2.55 | 2.96 | 2.96 | 2.95 |
+| 384 | 2 | 3.04 | 3.04 | 3.03 | 3.90 | 3.90 | 3.90 |
+| 384 | 4 | 4.01 | 4.02 | 4.01 | 5.68 | 5.74 | 5.67 |
+| 384 | 8 | 7.18 | 7.18 | 7.17 | 11.13 | 11.13 | 11.01 |
+| 384 | 12 | 9.14 | 9.15 | 9.13 | 15.43 | 15.44 | 15.32 |
+| 384 | 16 | 12.28 | 12.28 | 12.27 | 21.14 | 21.15 | 20.90 |
+| 384 | 24 | 17.68 | 17.68 | 17.54 | 30.98 | 31.02 | 30.68 |
+| 384 | 32 | 23.24 | 23.24 | 23.02 | 41.11 | 41.20 | 40.58 |
+| 384 | 64 | 44.86 | 45.13 | 44.78 | 79.25 | 79.68 | 79.10 |
+| 384 | 128 | 87.82 | 87.84 | 87.69 | 156.70 | 157.02 | 155.61 |
 
 ##### Megatron Large with Sparsity
 
 | Sequence Length | Batch Size | INT8 QAT Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.11 | 1.41 | 1.12 |
-| 128 | 2 | 1.33 | 1.34 | 1.33 |
-| 128 | 4 | 1.78 | 1.78 | 1.77 |
+| 128 | 1 | 1.11 | 1.40 | 1.11 |
+| 128 | 2 | 1.33 | 1.33 | 1.33 |
+| 128 | 4 | 1.78 | 1.78 | 1.78 |
 | 128 | 8 | 2.54 | 2.54 | 2.53 |
-| 128 | 12 | 2.97 | 2.97 | 2.96 |
-| 128 | 16 | 3.90 | 3.91 | 3.90 |
-| 128 | 24 | 4.89 | 4.89 | 4.88 |
-| 128 | 32 | 6.99 | 7.01 | 6.94 |
-| 128 | 64 | 11.62 | 11.69 | 11.60 |
-| 128 | 128 | 21.38 | 21.39 | 21.21 |
-| 384 | 1 | 1.68 | 1.68 | 1.68 |
+| 128 | 12 | 2.97 | 2.97 | 2.97 |
+| 128 | 16 | 3.99 | 3.99 | 3.98 |
+| 128 | 24 | 4.91 | 4.91 | 4.90 |
+| 128 | 32 | 7.13 | 7.13 | 7.12 |
+| 128 | 64 | 11.61 | 11.62 | 11.60 |
+| 128 | 128 | 21.22 | 21.32 | 21.09 |
+| 384 | 1 | 1.71 | 2.15 | 1.71 |
 | 384 | 2 | 2.21 | 2.21 | 2.21 |
-| 384 | 4 | 3.48 | 3.48 | 3.47 |
-| 384 | 8 | 5.73 | 5.74 | 5.73 |
-| 384 | 12 | 8.37 | 8.37 | 8.35 |
-| 384 | 16 | 10.35 | 10.36 | 10.33 |
-| 384 | 24 | 14.62 | 14.62 | 14.61 |
-| 384 | 32 | 18.91 | 18.95 | 18.75 |
-| 384 | 64 | 35.84 | 35.86 | 35.61 |
-| 384 | 128 | 67.81 | 67.83 | 67.73 |
+| 384 | 4 | 3.47 | 3.48 | 3.47 |
+| 384 | 8 | 5.74 | 5.74 | 5.74 |
+| 384 | 12 | 8.21 | 8.21 | 8.20 |
+| 384 | 16 | 10.33 | 10.34 | 10.32 |
+| 384 | 24 | 14.68 | 14.69 | 14.67 |
+| 384 | 32 | 18.73 | 18.74 | 18.72 |
+| 384 | 64 | 35.77 | 35.78 | 35.49 |
+| 384 | 128 | 67.78 | 67.95 | 67.63 |
 
 ### Inference Performance NVIDIA L4
 
@@ -517,78 +517,78 @@ Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` o
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 0.62 | 0.62 | 0.61 | 1.03 | 1.03 | 1.01 |
-| 128 | 2 | 0.81 | 0.81 | 0.78 | 1.35 | 1.37 | 1.33 |
-| 128 | 4 | 1.16 | 1.16 | 1.14 | 2.17 | 2.18 | 2.14 |
-| 128 | 8 | 1.95 | 2.00 | 1.92 | 3.68 | 3.68 | 3.60 |
-| 128 | 12 | 2.70 | 2.71 | 2.64 | 5.24 | 5.26 | 5.14 |
-| 128 | 16 | 3.44 | 3.44 | 3.34 | 6.77 | 6.77 | 6.64 |
-| 128 | 24 | 4.91 | 4.94 | 4.80 | 10.19 | 10.42 | 10.15 |
-| 128 | 32 | 6.31 | 6.40 | 6.23 | 13.57 | 13.72 | 13.41 |
-| 128 | 64 | 13.69 | 13.85 | 13.46 | 30.35 | 30.72 | 29.58 |
-| 128 | 128 | 28.90 | 29.15 | 28.61 | 66.75 | 67.06 | 66.09 |
-| 384 | 1 | 1.30 | 1.30 | 1.30 | 2.10 | 2.10 | 2.09 |
-| 384 | 2 | 1.85 | 1.86 | 1.84 | 3.18 | 3.20 | 3.17 |
-| 384 | 4 | 3.02 | 3.02 | 2.96 | 5.49 | 5.53 | 5.48 |
-| 384 | 8 | 5.60 | 5.64 | 5.50 | 11.10 | 11.11 | 10.90 |
-| 384 | 12 | 8.37 | 8.39 | 8.20 | 16.61 | 16.76 | 16.51 |
-| 384 | 16 | 11.18 | 11.19 | 11.02 | 23.24 | 23.56 | 23.16 |
-| 384 | 24 | 17.09 | 17.29 | 16.96 | 35.94 | 35.95 | 35.39 |
-| 384 | 32 | 23.38 | 23.57 | 23.17 | 50.65 | 50.92 | 50.51 |
-| 384 | 64 | 49.52 | 49.54 | 49.01 | 104.52 | 104.94 | 103.73 |
-| 384 | 128 | 104.93 | 105.33 | 103.94 | 197.12 | 197.56 | 196.03 |
+| 128 | 1 | 0.61 | 0.61 | 0.60 | 1.01 | 1.01 | 1.00 |
+| 128 | 2 | 0.79 | 0.80 | 0.77 | 1.32 | 1.35 | 1.31 |
+| 128 | 4 | 1.14 | 1.15 | 1.12 | 2.22 | 2.23 | 2.14 |
+| 128 | 8 | 1.94 | 1.96 | 1.90 | 3.66 | 3.67 | 3.63 |
+| 128 | 12 | 2.67 | 2.67 | 2.61 | 5.34 | 5.34 | 5.26 |
+| 128 | 16 | 3.37 | 3.38 | 3.32 | 6.69 | 6.69 | 6.64 |
+| 128 | 24 | 4.84 | 4.84 | 4.75 | 10.53 | 10.64 | 10.50 |
+| 128 | 32 | 6.21 | 6.28 | 6.13 | 13.91 | 13.91 | 13.72 |
+| 128 | 64 | 13.40 | 13.60 | 13.20 | 31.48 | 31.53 | 31.01 |
+| 128 | 128 | 28.42 | 28.68 | 27.84 | 70.60 | 71.10 | 69.25 |
+| 384 | 1 | 1.27 | 1.27 | 1.27 | 2.08 | 2.09 | 2.07 |
+| 384 | 2 | 1.84 | 1.84 | 1.82 | 3.15 | 3.19 | 3.11 |
+| 384 | 4 | 2.94 | 2.94 | 2.91 | 5.68 | 5.75 | 5.63 |
+| 384 | 8 | 5.53 | 5.55 | 5.42 | 11.45 | 11.59 | 11.32 |
+| 384 | 12 | 8.21 | 8.31 | 8.07 | 17.16 | 17.36 | 17.00 |
+| 384 | 16 | 10.96 | 11.07 | 10.80 | 23.20 | 23.50 | 22.81 |
+| 384 | 24 | 16.71 | 16.74 | 16.55 | 39.82 | 40.46 | 38.15 |
+| 384 | 32 | 22.82 | 23.00 | 22.63 | 50.56 | 50.89 | 50.14 |
+| 384 | 64 | 49.66 | 50.18 | 48.40 | 104.90 | 105.55 | 103.81 |
+| 384 | 128 | 104.78 | 105.09 | 103.96 | 208.20 | 208.70 | 206.93 |
 
 ##### BERT Large
 
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.81 | 1.82 | 1.79 | 3.15 | 3.16 | 3.12 |
-| 128 | 2 | 2.50 | 2.55 | 2.47 | 4.49 | 4.58 | 4.44 |
-| 128 | 4 | 3.60 | 3.62 | 3.59 | 6.94 | 6.95 | 6.90 |
-| 128 | 8 | 6.44 | 6.50 | 6.34 | 12.93 | 12.99 | 12.79 |
-| 128 | 12 | 8.53 | 8.53 | 8.35 | 18.26 | 18.27 | 18.08 |
-| 128 | 16 | 11.37 | 11.37 | 11.23 | 25.17 | 25.40 | 25.04 |
-| 128 | 24 | 16.13 | 16.14 | 16.09 | 35.45 | 35.45 | 35.26 |
-| 128 | 32 | 21.66 | 21.66 | 21.56 | 47.66 | 47.66 | 47.63 |
-| 128 | 64 | 47.07 | 47.08 | 46.65 | 102.00 | 102.24 | 101.29 |
-| 128 | 128 | 91.60 | 92.23 | 91.19 | 219.24 | 219.55 | 218.06 |
-| 384 | 1 | 3.47 | 3.48 | 3.47 | 6.53 | 6.63 | 6.36 |
-| 384 | 2 | 5.58 | 5.58 | 5.53 | 10.51 | 10.62 | 10.44 |
-| 384 | 4 | 9.91 | 10.01 | 9.73 | 20.58 | 20.80 | 20.10 |
-| 384 | 8 | 18.45 | 18.47 | 18.23 | 38.06 | 38.24 | 37.60 |
-| 384 | 12 | 27.03 | 27.03 | 26.72 | 58.94 | 59.27 | 58.09 |
-| 384 | 16 | 37.47 | 37.51 | 36.77 | 79.40 | 79.70 | 78.36 |
-| 384 | 24 | 55.02 | 55.25 | 54.56 | 123.06 | 123.32 | 121.71 |
-| 384 | 32 | 77.22 | 77.54 | 76.48 | 167.99 | 168.34 | 167.10 |
-| 384 | 64 | 157.21 | 157.53 | 155.69 | 335.31 | 335.96 | 333.65 |
-| 384 | 128 | 337.82 | 338.55 | 335.23 | 640.65 | 641.04 | 639.38 |
+| 128 | 1 | 1.79 | 1.80 | 1.77 | 3.11 | 3.11 | 3.09 |
+| 128 | 2 | 2.49 | 2.49 | 2.43 | 4.35 | 4.37 | 4.33 |
+| 128 | 4 | 3.62 | 3.70 | 3.60 | 6.86 | 6.89 | 6.78 |
+| 128 | 8 | 6.26 | 6.31 | 6.24 | 12.85 | 12.91 | 12.73 |
+| 128 | 12 | 8.40 | 8.41 | 8.28 | 18.42 | 18.43 | 18.33 |
+| 128 | 16 | 11.23 | 11.24 | 11.12 | 25.18 | 25.19 | 25.10 |
+| 128 | 24 | 15.95 | 16.09 | 15.90 | 35.67 | 35.67 | 35.47 |
+| 128 | 32 | 21.26 | 21.31 | 20.91 | 48.92 | 49.21 | 48.26 |
+| 128 | 64 | 44.10 | 44.11 | 43.92 | 108.81 | 109.12 | 107.18 |
+| 128 | 128 | 94.22 | 95.02 | 92.65 | 217.32 | 219.58 | 212.68 |
+| 384 | 1 | 3.41 | 3.43 | 3.39 | 6.55 | 6.57 | 6.36 |
+| 384 | 2 | 5.55 | 5.56 | 5.46 | 10.34 | 10.35 | 10.18 |
+| 384 | 4 | 9.69 | 9.79 | 9.53 | 20.66 | 20.95 | 19.94 |
+| 384 | 8 | 18.08 | 18.19 | 17.92 | 38.41 | 39.30 | 37.62 |
+| 384 | 12 | 26.20 | 26.44 | 26.11 | 60.38 | 60.91 | 58.67 |
+| 384 | 16 | 36.33 | 36.41 | 36.02 | 81.66 | 82.16 | 80.52 |
+| 384 | 24 | 53.54 | 53.61 | 53.08 | 123.01 | 123.34 | 122.10 |
+| 384 | 32 | 75.01 | 75.43 | 74.40 | 170.40 | 171.03 | 169.12 |
+| 384 | 64 | 157.97 | 158.62 | 155.87 | 349.25 | 351.53 | 344.76 |
+| 384 | 128 | 330.88 | 331.87 | 328.27 | 632.85 | 633.88 | 629.74 |
 
 ##### Megatron Large with Sparsity
 
 | Sequence Length | Batch Size | INT8 QAT Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 1.51 | 1.51 | 1.49 |
-| 128 | 2 | 2.05 | 2.06 | 2.01 |
-| 128 | 4 | 3.00 | 3.01 | 2.94 |
-| 128 | 8 | 5.06 | 5.08 | 5.05 |
-| 128 | 12 | 6.71 | 6.78 | 6.63 |
-| 128 | 16 | 8.83 | 8.84 | 8.75 |
-| 128 | 24 | 13.38 | 13.39 | 13.16 |
-| 128 | 32 | 17.61 | 17.63 | 17.50 |
-| 128 | 64 | 36.49 | 36.55 | 36.16 |
-| 128 | 128 | 80.34 | 80.39 | 79.62 |
-| 384 | 1 | 2.81 | 2.82 | 2.77 |
-| 384 | 2 | 4.20 | 4.23 | 4.12 |
-| 384 | 4 | 7.62 | 7.66 | 7.53 |
-| 384 | 8 | 15.13 | 15.15 | 14.97 |
-| 384 | 12 | 21.74 | 21.87 | 21.56 |
-| 384 | 16 | 28.83 | 29.00 | 28.70 |
-| 384 | 24 | 47.51 | 47.58 | 47.12 |
-| 384 | 32 | 61.31 | 61.50 | 60.79 |
-| 384 | 64 | 126.97 | 127.06 | 126.69 |
-| 384 | 128 | 256.27 | 256.61 | 255.09 |
+| 128 | 1 | 1.49 | 1.49 | 1.48 |
+| 128 | 2 | 2.03 | 2.03 | 1.99 |
+| 128 | 4 | 2.99 | 3.00 | 2.93 |
+| 128 | 8 | 5.00 | 5.07 | 4.99 |
+| 128 | 12 | 6.69 | 6.72 | 6.58 |
+| 128 | 16 | 8.77 | 8.84 | 8.66 |
+| 128 | 24 | 13.28 | 13.30 | 13.14 |
+| 128 | 32 | 17.41 | 17.44 | 17.26 |
+| 128 | 64 | 35.73 | 36.07 | 35.49 |
+| 128 | 128 | 79.03 | 79.15 | 78.47 |
+| 384 | 1 | 2.78 | 2.79 | 2.72 |
+| 384 | 2 | 4.10 | 4.12 | 4.06 |
+| 384 | 4 | 7.57 | 7.58 | 7.45 |
+| 384 | 8 | 15.03 | 15.10 | 14.86 |
+| 384 | 12 | 21.52 | 21.69 | 21.31 |
+| 384 | 16 | 28.29 | 28.33 | 28.10 |
+| 384 | 24 | 46.83 | 47.09 | 46.29 |
+| 384 | 32 | 60.29 | 60.47 | 59.37 |
+| 384 | 64 | 125.58 | 125.64 | 125.24 |
+| 384 | 128 | 253.46 | 253.90 | 252.28 |
 
 ### Inference Performance NVIDIA L40S
 
@@ -599,52 +599,52 @@ Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` o
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 0.34 | 0.34 | 0.34 | 0.48 | 0.48 | 0.48 |
-| 128 | 2 | 0.41 | 0.41 | 0.41 | 0.57 | 0.57 | 0.56 |
-| 128 | 4 | 0.50 | 0.50 | 0.50 | 0.78 | 0.78 | 0.78 |
-| 128 | 8 | 0.67 | 0.67 | 0.67 | 1.30 | 1.30 | 1.29 |
-| 128 | 12 | 0.92 | 0.93 | 0.91 | 1.78 | 1.78 | 1.76 |
-| 128 | 16 | 1.10 | 1.10 | 1.10 | 2.30 | 2.31 | 2.29 |
-| 128 | 24 | 1.48 | 1.48 | 1.47 | 3.30 | 3.31 | 3.26 |
-| 128 | 32 | 1.83 | 1.84 | 1.82 | 3.98 | 3.99 | 3.96 |
-| 128 | 64 | 3.52 | 3.53 | 3.49 | 8.46 | 8.52 | 8.40 |
-| 128 | 128 | 7.63 | 7.64 | 7.58 | 17.47 | 17.57 | 17.33 |
-| 384 | 1 | 0.73 | 0.73 | 0.73 | 1.04 | 1.04 | 1.03 |
-| 384 | 2 | 0.88 | 0.88 | 0.88 | 1.36 | 1.36 | 1.36 |
-| 384 | 4 | 1.17 | 1.17 | 1.16 | 2.21 | 2.21 | 2.19 |
-| 384 | 8 | 1.73 | 1.73 | 1.72 | 3.53 | 3.53 | 3.51 |
-| 384 | 12 | 2.73 | 2.74 | 2.72 | 5.25 | 5.26 | 5.18 |
-| 384 | 16 | 3.28 | 3.29 | 3.27 | 7.58 | 7.59 | 7.53 |
-| 384 | 24 | 4.97 | 4.98 | 4.94 | 10.37 | 10.40 | 10.27 |
-| 384 | 32 | 6.47 | 6.49 | 6.40 | 14.17 | 14.20 | 14.03 |
-| 384 | 64 | 14.05 | 14.07 | 13.89 | 31.25 | 31.34 | 30.90 |
-| 384 | 128 | 29.55 | 29.77 | 28.85 | 64.72 | 65.01 | 63.83 |
+| 128 | 1 | 0.33 | 0.33 | 0.33 | 0.48 | 0.48 | 0.48 |
+| 128 | 2 | 0.41 | 0.41 | 0.41 | 0.57 | 0.57 | 0.57 |
+| 128 | 4 | 0.50 | 0.51 | 0.50 | 0.78 | 0.78 | 0.78 |
+| 128 | 8 | 0.67 | 0.67 | 0.67 | 1.33 | 1.33 | 1.32 |
+| 128 | 12 | 0.91 | 0.91 | 0.91 | 1.75 | 1.76 | 1.73 |
+| 128 | 16 | 1.10 | 1.10 | 1.09 | 2.29 | 2.29 | 2.28 |
+| 128 | 24 | 1.48 | 1.49 | 1.47 | 3.30 | 3.31 | 3.27 |
+| 128 | 32 | 1.84 | 1.84 | 1.83 | 3.98 | 3.99 | 3.97 |
+| 128 | 64 | 3.61 | 3.66 | 3.56 | 8.64 | 8.70 | 8.51 |
+| 128 | 128 | 7.92 | 7.99 | 7.82 | 18.78 | 18.82 | 18.45 |
+| 384 | 1 | 0.73 | 0.73 | 0.73 | 1.11 | 1.12 | 1.10 |
+| 384 | 2 | 0.88 | 0.88 | 0.88 | 1.39 | 1.39 | 1.38 |
+| 384 | 4 | 1.17 | 1.17 | 1.17 | 2.19 | 2.20 | 2.19 |
+| 384 | 8 | 1.74 | 1.74 | 1.73 | 3.53 | 3.53 | 3.50 |
+| 384 | 12 | 2.75 | 2.75 | 2.73 | 5.32 | 5.33 | 5.29 |
+| 384 | 16 | 3.33 | 3.33 | 3.31 | 7.62 | 7.64 | 7.57 |
+| 384 | 24 | 4.97 | 4.98 | 4.95 | 10.53 | 10.57 | 10.40 |
+| 384 | 32 | 6.55 | 6.57 | 6.48 | 14.36 | 14.47 | 14.20 |
+| 384 | 64 | 14.27 | 14.37 | 14.07 | 33.31 | 33.51 | 32.65 |
+| 384 | 128 | 30.38 | 30.52 | 29.73 | 67.34 | 68.04 | 66.06 |
 
 ##### BERT Large
 
 | Sequence Length | Batch Size | INT8 Latency (ms) |               |         | FP16 Latency (ms) |               |         |
 |-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
-| 128 | 1 | 0.88 | 0.88 | 0.88 | 1.30 | 1.30 | 1.29 |
-| 128 | 2 | 0.99 | 0.99 | 0.98 | 1.51 | 1.51 | 1.50 |
-| 128 | 4 | 1.37 | 1.37 | 1.36 | 2.30 | 2.30 | 2.28 |
-| 128 | 8 | 1.96 | 1.96 | 1.95 | 3.92 | 3.93 | 3.90 |
-| 128 | 12 | 2.83 | 2.86 | 2.81 | 5.92 | 5.93 | 5.90 |
-| 128 | 16 | 3.27 | 3.27 | 3.24 | 6.81 | 6.82 | 6.75 |
-| 128 | 24 | 4.64 | 4.64 | 4.61 | 10.25 | 10.28 | 10.19 |
-| 128 | 32 | 5.73 | 5.74 | 5.68 | 13.17 | 13.19 | 13.01 |
-| 128 | 64 | 12.00 | 12.08 | 11.89 | 28.33 | 28.35 | 28.01 |
-| 128 | 128 | 26.06 | 26.22 | 25.74 | 65.44 | 65.68 | 64.41 |
-| 384 | 1 | 1.68 | 1.68 | 1.67 | 2.72 | 2.72 | 2.71 |
-| 384 | 2 | 2.29 | 2.29 | 2.28 | 3.95 | 3.96 | 3.94 |
-| 384 | 4 | 3.31 | 3.31 | 3.30 | 6.50 | 6.55 | 6.45 |
-| 384 | 8 | 5.15 | 5.16 | 5.13 | 10.84 | 10.87 | 10.69 |
-| 384 | 12 | 8.14 | 8.15 | 8.10 | 19.89 | 19.99 | 19.37 |
-| 384 | 16 | 9.96 | 9.98 | 9.86 | 22.65 | 22.68 | 22.45 |
-| 384 | 24 | 15.37 | 15.42 | 15.23 | 35.42 | 35.49 | 35.08 |
-| 384 | 32 | 20.32 | 20.45 | 20.04 | 48.00 | 48.01 | 47.26 |
-| 384 | 64 | 44.74 | 44.94 | 43.95 | 104.17 | 104.49 | 102.96 |
-| 384 | 128 | 90.01 | 90.24 | 88.73 | 205.73 | 206.26 | 203.73 |
+| 128 | 1 | 0.89 | 0.89 | 0.88 | 1.30 | 1.30 | 1.29 |
+| 128 | 2 | 0.97 | 0.98 | 0.97 | 1.45 | 1.45 | 1.44 |
+| 128 | 4 | 1.36 | 1.36 | 1.35 | 2.30 | 2.30 | 2.29 |
+| 128 | 8 | 1.94 | 1.96 | 1.93 | 3.89 | 3.90 | 3.88 |
+| 128 | 12 | 2.82 | 2.82 | 2.80 | 5.89 | 5.90 | 5.85 |
+| 128 | 16 | 3.26 | 3.27 | 3.24 | 6.85 | 6.86 | 6.80 |
+| 128 | 24 | 4.62 | 4.63 | 4.59 | 10.72 | 10.73 | 10.64 |
+| 128 | 32 | 5.74 | 5.76 | 5.70 | 13.22 | 13.23 | 13.04 |
+| 128 | 64 | 12.18 | 12.20 | 11.97 | 29.42 | 29.59 | 28.89 |
+| 128 | 128 | 26.68 | 26.86 | 26.23 | 68.72 | 69.05 | 67.12 |
+| 384 | 1 | 1.68 | 1.68 | 1.68 | 2.78 | 2.78 | 2.77 |
+| 384 | 2 | 2.31 | 2.31 | 2.30 | 3.95 | 3.95 | 3.94 |
+| 384 | 4 | 3.29 | 3.30 | 3.29 | 6.57 | 6.58 | 6.50 |
+| 384 | 8 | 5.16 | 5.17 | 5.13 | 10.89 | 10.90 | 10.79 |
+| 384 | 12 | 8.16 | 8.17 | 8.10 | 19.81 | 19.91 | 19.31 |
+| 384 | 16 | 9.90 | 9.93 | 9.80 | 23.34 | 23.51 | 23.10 |
+| 384 | 24 | 15.60 | 15.62 | 15.39 | 37.37 | 37.48 | 36.93 |
+| 384 | 32 | 20.66 | 20.73 | 20.33 | 50.13 | 50.34 | 49.52 |
+| 384 | 64 | 46.31 | 46.53 | 45.39 | 111.74 | 111.98 | 110.14 |
+| 384 | 128 | 93.80 | 94.04 | 92.33 | 213.05 | 214.15 | 210.25 |
 
 ##### Megatron Large with Sparsity
 
@@ -652,23 +652,23 @@ Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` o
 |-----------------|------------|-----------------|-----------------|---------|
 |                 |            | 95th Percentile | 99th Percentile | Average |
 | 128 | 1 | 0.76 | 0.76 | 0.76 |
-| 128 | 2 | 0.90 | 0.90 | 0.90 |
-| 128 | 4 | 1.14 | 1.14 | 1.13 |
-| 128 | 8 | 1.72 | 1.72 | 1.71 |
-| 128 | 12 | 2.28 | 2.28 | 2.28 |
-| 128 | 16 | 2.74 | 2.74 | 2.74 |
-| 128 | 24 | 4.53 | 4.53 | 4.52 |
-| 128 | 32 | 5.17 | 5.23 | 5.14 |
-| 128 | 64 | 10.19 | 10.20 | 10.13 |
-| 128 | 128 | 21.23 | 21.30 | 20.96 |
+| 128 | 2 | 0.91 | 0.91 | 0.91 |
+| 128 | 4 | 1.13 | 1.13 | 1.13 |
+| 128 | 8 | 1.70 | 1.70 | 1.70 |
+| 128 | 12 | 2.26 | 2.26 | 2.25 |
+| 128 | 16 | 2.72 | 2.72 | 2.71 |
+| 128 | 24 | 4.54 | 4.55 | 4.52 |
+| 128 | 32 | 5.14 | 5.16 | 5.10 |
+| 128 | 64 | 10.07 | 10.08 | 10.01 |
+| 128 | 128 | 21.57 | 21.67 | 21.21 |
 | 384 | 1 | 1.13 | 1.13 | 1.13 |
-| 384 | 2 | 1.65 | 1.65 | 1.64 |
-| 384 | 4 | 2.53 | 2.53 | 2.52 |
-| 384 | 8 | 4.99 | 5.00 | 4.98 |
-| 384 | 12 | 6.55 | 6.55 | 6.50 |
-| 384 | 16 | 8.55 | 8.56 | 8.50 |
-| 384 | 24 | 12.72 | 12.73 | 12.68 |
-| 384 | 32 | 16.78 | 16.85 | 16.67 |
-| 384 | 64 | 36.48 | 36.55 | 35.85 |
-| 384 | 128 | 78.19 | 79.69 | 76.16 |
+| 384 | 2 | 1.64 | 1.65 | 1.62 |
+| 384 | 4 | 2.51 | 2.51 | 2.50 |
+| 384 | 8 | 5.02 | 5.03 | 4.99 |
+| 384 | 12 | 6.43 | 6.43 | 6.41 |
+| 384 | 16 | 8.47 | 8.49 | 8.41 |
+| 384 | 24 | 12.62 | 12.65 | 12.54 |
+| 384 | 32 | 16.88 | 16.91 | 16.74 |
+| 384 | 64 | 36.62 | 36.71 | 36.12 |
+| 384 | 128 | 79.88 | 80.18 | 77.33 |