Feat/perf opt (NVIDIA#462)
* feat: optimize string usage in many utils

* feat: support streaming output for T5 Triton

* feat: add deviceId to hashId to support multi-GPU cubin loading

* feat: support fp8 bert on ada
byshiue authored Feb 21, 2023
1 parent 43ea4f3 commit fba7567
Showing 30 changed files with 111,489 additions and 221 deletions.
6 changes: 5 additions & 1 deletion 3rdparty/fp8_qgmma_1x1/sharedCubinLoader.h
@@ -173,7 +173,11 @@ class TSharedCubinKernelFactory

    inline uint64_t hashID(int32_t sm) const
    {
-       return (uint64_t)sm;
+       // Concatenate sm with deviceID to support Multi-GPU cubin loading
+       // Bottom 32 bits are for SM, top 32 bits for deviceID
+       int32_t deviceID{0};
+       cudaGetDevice(&deviceID);
+       return (uint64_t) deviceID << 32 | (uint64_t)sm;
    }

    std::unordered_map<uint64_t, std::unique_ptr<TKernelList> const> mKernels;
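
Before this change the cache was keyed by SM version alone, so two GPUs with the same SM version shared one mKernels entry; the widened key gives each device its own. A minimal standalone sketch of the packing scheme (the helper names and the demo are illustrative, not part of this commit):

#include <cstdint>
#include <cstdio>

// Illustrative only: pack/unpack a cache key the same way hashID() now does,
// with the device ordinal in the top 32 bits and the SM version in the bottom 32.
static uint64_t makeCubinCacheKey(int32_t deviceID, int32_t sm)
{
    return (uint64_t) deviceID << 32 | (uint64_t) sm;
}

static int32_t keyDeviceID(uint64_t key)
{
    return (int32_t) (key >> 32);
}

static int32_t keySM(uint64_t key)
{
    return (int32_t) (key & 0xFFFFFFFFu);
}

int main()
{
    // Two sm_89 (Ada) GPUs in one process: same SM version, different device
    // ordinals, so each device gets its own cache entry instead of colliding.
    uint64_t key0 = makeCubinCacheKey(0, 89);
    uint64_t key1 = makeCubinCacheKey(1, 89);
    std::printf("key0=0x%llx key1=0x%llx distinct=%d\n",
        (unsigned long long) key0, (unsigned long long) key1, (int) (key0 != key1));
    std::printf("device=%d sm=%d\n", keyDeviceID(key1), keySM(key1));
    return 0;
}
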
41 changes: 41 additions & 0 deletions 3rdparty/trt_fp8_fmha/fused_multihead_attention.h
@@ -109,12 +109,22 @@
extern unsigned char cubin_fmha_v2_e4m3_192_64_ldgsts_sm90_cu_cubin[];
extern unsigned char cubin_fmha_v2_e4m3_256_64_ldgsts_sm90_cu_cubin[];
extern unsigned char cubin_fmha_v2_e4m3_384_64_ldgsts_sm90_cu_cubin[];
extern unsigned char cubin_fmha_v2_e4m3_512_64_ldgsts_sm90_cu_cubin[];
+extern unsigned char cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin[];
+extern unsigned char cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin[];
+extern unsigned char cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin[];
+extern unsigned char cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin[];
+extern unsigned char cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin[];

extern unsigned int cubin_fmha_v2_e4m3_128_64_ldgsts_sm90_cu_cubin_len;
extern unsigned int cubin_fmha_v2_e4m3_192_64_ldgsts_sm90_cu_cubin_len;
extern unsigned int cubin_fmha_v2_e4m3_256_64_ldgsts_sm90_cu_cubin_len;
extern unsigned int cubin_fmha_v2_e4m3_384_64_ldgsts_sm90_cu_cubin_len;
extern unsigned int cubin_fmha_v2_e4m3_512_64_ldgsts_sm90_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin_len;

static const struct FusedMultiHeadAttentionKernelMetaInfoFP8V2
{
@@ -131,6 +141,37 @@
    bool mInterleaved;
} sMhaKernelMetaInfosFP8V2[] = {
#if CUDA_VERSION >= 11080
+    // Ada
+    {DATA_TYPE_E4M3, 128, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_128_64_sm89_kernel", 32768, 128, 0, false},
+    {DATA_TYPE_E4M3, 128, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_128_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_128_64_sm89_kernel_nl", 20480, 128, 16, false},
+    {DATA_TYPE_E4M3, 192, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_192_64_sm89_kernel", 36864, 128, 0, false},
+    {DATA_TYPE_E4M3, 192, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_192_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_192_64_sm89_kernel_nl", 36864, 128, 32, false},
+    {DATA_TYPE_E4M3, 256, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_256_64_sm89_kernel", 36864, 128, 0, false},
+    {DATA_TYPE_E4M3, 256, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_256_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_256_64_sm89_kernel_nl", 36864, 128, 32, false},
+    {DATA_TYPE_E4M3, 384, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_384_64_sm89_kernel", 53248, 128, 0, false},
+    {DATA_TYPE_E4M3, 384, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_384_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_384_64_sm89_kernel_nl", 53248, 128, 32, false},
+    {DATA_TYPE_E4M3, 512, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_512_64_sm89_kernel", 73728, 256, 0, false},
+    {DATA_TYPE_E4M3, 512, 64, kSM_89, cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin,
+        cubin_fmha_v2_e4m3_fp32_512_64_sm89_cu_cubin_len,
+        "fmha_v2_e4m3_fp32_512_64_sm89_kernel_nl", 73728, 256, 32, false},
    // Hopper
    {DATA_TYPE_E4M3, 128, 64, kSM_90, cubin_fmha_v2_e4m3_128_64_ldgsts_sm90_cu_cubin,
        cubin_fmha_v2_e4m3_128_64_ldgsts_sm90_cu_cubin_len,
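
Judging by the initializers, each new kSM_89 row binds a data type, sequence length, and head size to an Ada cubin image, its length, the kernel symbol to load, a shared-memory requirement in bytes, threads per CTA, an unroll step (the _nl variants), and the interleaved flag; the struct's member names themselves are collapsed in this view. A hedged sketch of the usual table-lookup pattern, with stand-in names rather than the commit's actual code:

#include <cstddef>
#include <cstdio>

// Stand-in mirror of the metadata rows above (illustrative field names; the
// real FusedMultiHeadAttentionKernelMetaInfoFP8V2 members are collapsed in
// this diff). One row per kernel variant.
struct KernelMetaSketch
{
    int seqLen;         // supported sequence length
    int headSize;       // head dimension
    int sm;             // target SM version, e.g. 89 (Ada)
    int sharedMemBytes; // dynamic shared memory the kernel needs
    int threadsPerCTA;  // launch block size
    int unrollStep;     // 0 = the non-"_nl" variant
    bool interleaved;
};

static const KernelMetaSketch kAdaFp8Metas[] = {
    {128, 64, 89, 32768, 128, 0, false},
    {128, 64, 89, 20480, 128, 16, false},
    {192, 64, 89, 36864, 128, 0, false},
    {192, 64, 89, 36864, 128, 32, false},
    {256, 64, 89, 36864, 128, 0, false},
    {256, 64, 89, 36864, 128, 32, false},
    {384, 64, 89, 53248, 128, 0, false},
    {384, 64, 89, 53248, 128, 32, false},
    {512, 64, 89, 73728, 256, 0, false},
    {512, 64, 89, 73728, 256, 32, false},
};

// Typical selection pattern for such tables: the first row matching the target
// SM and head size whose sequence length covers the request wins.
static const KernelMetaSketch* pickKernel(int seqLen, int headSize, int sm)
{
    for (size_t i = 0; i < sizeof(kAdaFp8Metas) / sizeof(kAdaFp8Metas[0]); ++i)
    {
        const KernelMetaSketch& m = kAdaFp8Metas[i];
        if (m.sm == sm && m.headSize == headSize && m.seqLen >= seqLen)
        {
            return &m;
        }
    }
    return nullptr; // fall back to an unfused path when no cubin fits
}

int main()
{
    // Under these assumptions, FP8 attention at sequence length 384 on an
    // sm_89 device lands on the 53248-byte shared-memory, 128-thread kernel.
    const KernelMetaSketch* m = pickKernel(384, 64, 89);
    if (m != nullptr)
    {
        std::printf("smem=%d threads=%d unroll=%d\n",
            m->sharedMemBytes, m->threadsPerCTA, m->unrollStep);
    }
    return 0;
}
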
2 changes: 1 addition & 1 deletion 3rdparty/trt_fused_multihead_attention/CMakeLists.txt
@@ -20,7 +20,7 @@ set(trt_fused_multi_head_attention_files
)

file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} *sm*.cpp)
-file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} ./sm90/*.cubin.cpp)
+file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} ./fp8/*.cubin.cpp)

add_library(trt_fused_multi_head_attention STATIC ${trt_fused_multi_head_attention_files})
target_link_libraries(trt_fused_multi_head_attention PUBLIC cuda_driver_wrapper -lcublas -lcudart)
(Diffs for the remaining 27 changed files are not shown.)
