[CI] Run the documentation workload on a100-runner-set (triton-lang…
Jokeren authored Jun 11, 2024
1 parent 9a0a7c2 commit 053aa0d
Showing 2 changed files with 24 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
@@ -8,7 +8,7 @@ permissions: read-all

 jobs:
   Build-Documentation:
-    runs-on: [self-hosted, A100]
+    runs-on: [a100-runner-set]
     timeout-minutes: 30

     steps:
46 changes: 23 additions & 23 deletions python/tutorials/09-persistent-fp8-matmul.py
@@ -2,13 +2,12 @@
 Persistent FP8 Matmul
 =====================
 This script demonstrates persistent kernel implementations of matrix multiplication using Triton.
-It includes various matmul methods, such as naive, persistent, and TMA (Tile Matrix Accumulation) based approaches, and only supports GPUs with compute capability >= 9.0.
+It includes various matmul methods, such as naive, persistent, and TMA (Tensor Memory Accelerator) based approaches, and only supports GPUs with compute capability >= 9.0.
 Triton and CuBLAS implementations are benchmarked under different configurations and evaluated using the proton profiler.
 Users can pass command-line arguments to specify matrix dimensions and iteration steps flexibly.
 """

 import argparse
-import sys
 import time

 import numpy as np
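Editor's note: the docstring fix above is substantive. On NVIDIA hardware, TMA stands for Tensor Memory Accelerator, the Hopper-generation unit for asynchronous tile transfers, not "Tile Matrix Accumulation". A minimal sketch of the capability probe the tutorial gates on; the helper name supports_fp8_tutorial is illustrative, not part of the diff:

import torch

# torch.cuda.get_device_capability() returns (major, minor),
# e.g. (9, 0) on Hopper-class GPUs such as the H100.
def supports_fp8_tutorial() -> bool:
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 9  # the tutorial requires compute capability >= 9.0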
@@ -19,12 +18,9 @@

 from triton._C.libtriton import nvidia

-if not (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9):
-    print("This tutorial fp8_matmul is only supported on CUDA with cc >= 90")
-    sys.exit(0)
-
-cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
-cublas = nvidia.cublas.CublasLt(cublas_workspace)
+if torch.cuda.is_available():
+    cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
+    cublas = nvidia.cublas.CublasLt(cublas_workspace)


 def _matmul_launch_metadata(grid, kernel, args):
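Editor's note: the hunk above removes a module-level sys.exit(0) and an unconditional CUDA allocation, both of which previously ran as import-time side effects and would break any tool that merely imports the tutorial, such as the documentation build this commit moves to a100-runner-set. A sketch of the general pattern under that assumption; the lazy accessor get_cublas_workspace is hypothetical, not part of the tutorial:

import torch

_cublas_workspace = None

def get_cublas_workspace():
    # Hypothetical helper: allocate the 32 MiB workspace on first use
    # instead of at import time, so importing stays safe without a GPU.
    global _cublas_workspace
    if _cublas_workspace is None:
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA device required")
        _cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
    return _cublas_workspace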
@@ -397,22 +393,26 @@ def validate(M, N, K):
 )


-parser = argparse.ArgumentParser()
-parser.add_argument("-K", type=int, required=False)
-parser.add_argument("--K_range", type=int, nargs=2)
-parser.add_argument("--K_step", type=int, default=512)
-args = parser.parse_args()
+if __name__ == "__main__":
+    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("-K", type=int, required=False)
+        parser.add_argument("--K_range", type=int, nargs=2)
+        parser.add_argument("--K_step", type=int, default=512)
+        args = parser.parse_args()

-if args.K:
-    args.K_range = [args.K, args.K]
-    args.K_step = 1  # doesn't matter as long as it's not 0
+        if args.K:
+            args.K_range = [args.K, args.K]
+            args.K_step = 1  # doesn't matter as long as it's not 0

-torch.manual_seed(0)
+        torch.manual_seed(0)

-validate(32, 32, 32)
-validate(8192, 8192, 512)
+        validate(32, 32, 32)
+        validate(8192, 8192, 512)

-proton.start("matmul", hook="triton")
-for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
-    bench(K)
-proton.finalize()
+        proton.start("matmul", hook="triton")
+        for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
+            bench(K)
+        proton.finalize()
+    else:
+        print("This tutorial fp8_matmul is only supported on CUDA with cc >= 90")
