From 900b05b9574e5748b469574c5a6b9dd6f4d10342 Mon Sep 17 00:00:00 2001
From: Hongtao Yu
Date: Thu, 6 Jun 2024 10:24:38 -0700
Subject: [PATCH] Add a new config for row-wise quant fp8 gemm perf bench with
 fp8_fast_accum=false (#2686)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2686

Adding a new config learned from cuBLAS.

Reviewed By: jianyuh

Differential Revision: D57746696

fbshipit-source-id: 1d34766a4aaa874d42338be2867d67be45a5152e
---
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
index 5dbe179cbd..4473b4b29b 100644
--- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
+++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
@@ -332,7 +332,14 @@ def _kernel_matmul_fp8_row(
 
 
 @triton.autotune(
-    configs=MATMUL_CONFIGS,
+    configs=MATMUL_CONFIGS
+    + [
+        Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1},
+            num_stages=3,
+            num_warps=8,
+        ),
+    ],
     key=[
         "m_key",
         "n_key",
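
Note (not part of the patch): below is a minimal, illustrative sketch of how
the appended Config enters Triton's autotuning search. The one-entry
MATMUL_CONFIGS stand-in and the print loop are assumptions for illustration;
in fp8_gemm.py the combined list feeds the @triton.autotune decorator shown in
the hunk above, keyed on "m_key", "n_key", and the rest of the (truncated) key
list.

# Illustrative sketch only -- not part of the patch. MATMUL_CONFIGS here is a
# one-entry stand-in for the real list defined in fp8_gemm.py.
from triton import Config

MATMUL_CONFIGS = [
    Config(
        {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1},
        num_stages=4,
        num_warps=4,
    ),  # stand-in for the existing entries
]

# The patch appends one cuBLAS-derived candidate. triton.autotune benchmarks
# every config in the list the first time a new key tuple is seen and caches
# the fastest one for subsequent calls with the same key.
MATMUL_CONFIGS = MATMUL_CONFIGS + [
    Config(
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1},
        num_stages=3,
        num_warps=8,
    ),
]

for cfg in MATMUL_CONFIGS:
    print(cfg)

A plausible reading of the design choice: a 128x128x128 tile with num_warps=8
and a shallower 3-stage pipeline targets large GEMM shapes where
fp8_fast_accum=False makes each MMA more expensive. That rationale is an
inference from the "learned from cuBLAS" remark in the summary, not something
the patch itself states.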