From 900b05b9574e5748b469574c5a6b9dd6f4d10342 Mon Sep 17 00:00:00 2001
From: Hongtao Yu
Date: Thu, 6 Jun 2024 10:24:38 -0700
Subject: [PATCH] Add a new config for row-wise quant fp8 gemm perf bench with
 fp8_fast_accum=false (#2686)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2686

Adding a new config learned from cuBLAS.

Reviewed By: jianyuh

Differential Revision: D57746696

fbshipit-source-id: 1d34766a4aaa874d42338be2867d67be45a5152e
---
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
index 5dbe179cbd..4473b4b29b 100644
--- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
+++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
@@ -332,7 +332,14 @@ def _kernel_matmul_fp8_row(
 
 
 @triton.autotune(
-    configs=MATMUL_CONFIGS,
+    configs=MATMUL_CONFIGS
+    + [
+        Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1},
+            num_stages=3,
+            num_warps=8,
+        ),
+    ],
     key=[
         "m_key",
         "n_key",
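
Note (not part of the patch): below is a minimal, illustrative sketch of how
the appended Config enters Triton's autotuning search. The one-entry
MATMUL_CONFIGS stand-in and the print loop are assumptions for illustration;
in fp8_gemm.py the combined list feeds the @triton.autotune decorator shown in
the hunk above, keyed on "m_key", "n_key", and the rest of the (truncated) key
list.

# Illustrative sketch only -- not part of the patch. MATMUL_CONFIGS here is a
# one-entry stand-in for the real list defined in fp8_gemm.py.
from triton import Config

MATMUL_CONFIGS = [
    Config(
        {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1},
        num_stages=4,
        num_warps=4,
    ),  # stand-in for the existing entries
]

# The patch appends one cuBLAS-derived candidate. triton.autotune benchmarks
# every config in the list the first time a new key tuple is seen and caches
# the fastest one for subsequent calls with the same key.
MATMUL_CONFIGS = MATMUL_CONFIGS + [
    Config(
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1},
        num_stages=3,
        num_warps=8,
    ),
]

for cfg in MATMUL_CONFIGS:
    print(cfg)

A plausible reading of the design choice: a 128x128x128 tile with num_warps=8
and a shallower 3-stage pipeline targets large GEMM shapes where
fp8_fast_accum=False makes each MMA more expensive. That rationale is an
inference from the "learned from cuBLAS" remark in the summary, not something
the patch itself states.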